Term Frequency Inverse Document Frequency - Machine Learning

Term Frequency Inverse Document Frequency

Preliminaries

# Load libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

Create Text Data

# Build a toy corpus: one string per document, three documents in total
text_data = np.array(
    [
        'I love Brazil. Brazil!',
        'Sweden is best',
        'Germany beats both',
    ]
)

Create Feature Matrix

# Fit a tf-idf vectorizer on the corpus and encode every document in one call
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(raw_documents=text_data)

# Convert the result to a plain array so each document's weights can be inspected
feature_matrix.toarray()
array([[ 0.        ,  0.        ,  0.        ,  0.89442719,  0.        ,
         0.        ,  0.4472136 ,  0.        ],
       [ 0.        ,  0.57735027,  0.        ,  0.        ,  0.        ,
         0.57735027,  0.        ,  0.57735027],
       [ 0.57735027,  0.        ,  0.57735027,  0.        ,  0.57735027,
         0.        ,  0.        ,  0.        ]])
# Show the feature names (the vocabulary term behind each matrix column).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an array, so
# .tolist() is used to display a plain Python list of terms.
tfidf.get_feature_names_out().tolist()
['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

View Feature Matrix As Data Frame

# Create a data frame with one row per document and one column per term.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names_out())
beats best both brazil germany is love sweden
0 0.00000 0.00000 0.00000 0.894427 0.00000 0.00000 0.447214 0.00000
1 0.00000 0.57735 0.00000 0.000000 0.00000 0.57735 0.000000 0.57735
2 0.57735 0.00000 0.57735 0.000000 0.57735 0.00000 0.000000 0.00000