TF-IDF example
Function for calculating the relative frequency (term frequency or tf)
# Example corpus: three short documents. The functions below tokenize
# each one with str.split(), i.e. on whitespace only.
docs = [
"this is a sample document",
"this document is another example",
"this example document is different"
]
# Calculate tf
def compute_tf(doc):
    """Return the term frequency of every word in ``doc``.

    The document is split on whitespace, and each distinct word is mapped
    to its number of occurrences divided by the total number of words.

    Args:
        doc: The document as a single string.

    Returns:
        A dict mapping each distinct word to its relative frequency, in
        order of first appearance. A document with no words yields an
        empty dict.
    """
    # Function-scope import so the block needs nothing beyond the stdlib.
    from collections import Counter

    words = doc.split()
    total_words = len(words)
    # Counter replaces the manual ``get(word, 0) + 1`` tally and keeps
    # first-seen order, so the resulting dict is unchanged.
    return {
        word: count / total_words
        for word, count in Counter(words).items()
    }
# Show the term-frequency table of each example document in turn
for doc in docs:
    print(compute_tf(doc))
{'this': 0.2, 'is': 0.2, 'a': 0.2, 'sample': 0.2, 'document': 0.2}
{'this': 0.2, 'document': 0.2, 'is': 0.2, 'another': 0.2, 'example': 0.2}
{'this': 0.2, 'example': 0.2, 'document': 0.2, 'is': 0.2, 'different': 0.2}
Function to calculate the inverse document frequency (IDF)
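IDF measures how informative a word is across a collection (or corpus) of documents. The intuition is that words appearing in many documents are less informative than words appearing in only a few. Here it is computed as $\mathrm{idf}(t) = \ln(N / n_t)$, where $N$ is the number of documents and $n_t$ is the number of documents that contain the word $t$; a word that occurs in every document therefore gets an IDF of 0.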
# Example corpus: three short documents, tokenized on whitespace by
# compute_idf below.
docs = [
"this is a sample document",
"this document is another example",
"this example document is different"
]
import math
# Calculate idf
def compute_idf(docs):
    """Return the inverse document frequency of every word in ``docs``.

    Each document is split on whitespace. For every distinct word the
    result is ``log(N / n)`` (natural logarithm), where ``N`` is the
    number of documents and ``n`` is the number of documents containing
    the word.

    Args:
        docs: A sequence of documents, each a single string.

    Returns:
        A dict mapping each distinct word to its IDF. A word present in
        every document maps to ``0.0``. An empty corpus yields an empty
        dict.
    """
    # Function-scope import so the block needs no new file-level import.
    from collections import Counter

    total_docs = len(docs)
    doc_freq = Counter()
    for doc in docs:
        # set() so a word repeated inside one document is counted once.
        doc_freq.update(set(doc.split()))
    # Python 3 ``/`` is already true division; no float() cast needed.
    return {
        word: math.log(total_docs / containing)
        for word, containing in doc_freq.items()
    }
# Show the IDF of every distinct word in the example corpus
print(compute_idf(docs))
{'document': 0.0, 'is': 0.0, 'sample': 1.0986122886681098, 'a': 1.0986122886681098, 'this': 0.0, 'example': 0.4054651081081644, 'another': 1.0986122886681098, 'different': 1.0986122886681098}
Functions to calculate tf-idf
TF-IDF stands for Term Frequency-Inverse Document Frequency and is used to assess the importance of a word in a document in relation to a collection of documents (corpus).
# Example corpus: three short documents, tokenized on whitespace by
# compute_tf and compute_idf below.
docs = [
"this is a sample document",
"this document is another example",
"this example document is different"
]
import math
# Step 1: compute TF
def compute_tf(doc):
    """Return the term frequency of every word in ``doc``.

    The document is split on whitespace, and each distinct word is mapped
    to its number of occurrences divided by the total number of words.

    Args:
        doc: The document as a single string.

    Returns:
        A dict mapping each distinct word to its relative frequency, in
        order of first appearance. A document with no words yields an
        empty dict.
    """
    # Function-scope import so the block needs nothing beyond the stdlib.
    from collections import Counter

    words = doc.split()
    total_words = len(words)
    # Counter replaces the manual ``get(word, 0) + 1`` tally and keeps
    # first-seen order, so the resulting dict is unchanged.
    return {
        word: count / total_words
        for word, count in Counter(words).items()
    }
# Step 2: compute IDF
def compute_idf(docs):
    """Return the inverse document frequency of every word in ``docs``.

    Each document is split on whitespace. For every distinct word the
    result is ``log(N / n)`` (natural logarithm), where ``N`` is the
    number of documents and ``n`` is the number of documents containing
    the word.

    Args:
        docs: A sequence of documents, each a single string.

    Returns:
        A dict mapping each distinct word to its IDF. A word present in
        every document maps to ``0.0``. An empty corpus yields an empty
        dict.
    """
    # Function-scope import so the block needs no new file-level import.
    from collections import Counter

    total_docs = len(docs)
    doc_freq = Counter()
    for doc in docs:
        # set() so a word repeated inside one document is counted once.
        doc_freq.update(set(doc.split()))
    # Python 3 ``/`` is already true division; no float() cast needed.
    return {
        word: math.log(total_docs / containing)
        for word, containing in doc_freq.items()
    }
# Step 3: compute TF-IDF
def compute_tfidf(tf, idf):
    """Combine one document's term frequencies with corpus-wide IDF values.

    Args:
        tf: Dict mapping each word of one document to its term frequency.
        idf: Dict mapping words to their inverse document frequency.

    Returns:
        A dict with the same keys (and order) as ``tf``, where each value
        is ``tf[word] * idf[word]``. A word missing from ``idf`` is given
        an IDF of 0, so its score is zero.
    """
    # Dict comprehension instead of filling an empty dict in a loop.
    return {word: freq * idf.get(word, 0) for word, freq in tf.items()}
# Per-document term frequencies, in corpus order
tf_list = list(map(compute_tf, docs))
# Corpus-wide inverse document frequencies
idf = compute_idf(docs)
# Weight every document's TF table with the shared IDF table
tfidf_list = [compute_tfidf(doc_tf, idf) for doc_tf in tf_list]
# Report each document's scores, rounded to four decimal places
for i, tfidf in enumerate(tfidf_list):
    print(f"TF-IDF for dokument {i+1}:")
    for word, score in tfidf.items():
        print(f" {word}: {score:.4f}")
import pandas as pd
# Tabulate the TF-IDF results: one row per document, one column per word.
# Any missing entries are filled with 0.
df = pd.DataFrame(tfidf_list)
df = df.fillna(0)
print('\n')
df
TF-IDF for dokument 1:
this: 0.0000
is: 0.0000
a: 0.2197
sample: 0.2197
document: 0.0000
TF-IDF for dokument 2:
this: 0.0000
document: 0.0000
is: 0.0000
another: 0.2197
example: 0.0811
TF-IDF for dokument 3:
this: 0.0000
example: 0.0811
document: 0.0000
is: 0.0000
different: 0.2197
| | this | is | a | sample | document | another | example | different |
---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.219722 | 0.219722 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
1 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.219722 | 0.081093 | 0.000000 |
2 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.081093 | 0.219722 |