Bag Of Words - Machine Learning

Bag Of Words

Preliminaries

# Load library
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

Create Text Data

# Sample corpus: three short documents, one string per document
text_data = np.asarray((
    'I love Brazil. Brazil!',
    'Sweden is best',
    'Germany beats both',
))

Create Bag Of Words

# Fit a default CountVectorizer on the corpus and build the token-count matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(raw_documents=text_data)

# Convert the result to a dense array for display
bag_of_words.toarray()
array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

View Bag Of Words Matrix Column Headers

# Get feature names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array, so convert
# it to a list to keep a plain list of column labels.
feature_names = count.get_feature_names_out().tolist()

# View feature names
feature_names
['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

View As A Data Frame

# Build a data frame from the dense counts, one column per vocabulary term
pd.DataFrame(
    data=bag_of_words.toarray(),
    columns=feature_names,
)
   beats  best  both  brazil  germany  is  love  sweden
0      0     0     0       2        0   0     1       0
1      0     1     0       0        0   1     0       1
2      1     0     1       0        1   0     0       0