TF-IDF example

Function for calculating the relative frequency (term frequency, or TF)

# Example documents: a toy corpus of three short sentences. The functions
# below tokenize each one by splitting on whitespace.
docs = [
    "this is a sample document",
    "this document is another example",
    "this example document is different"
]

# Calculate tf
def compute_tf(doc):
    """Return the relative frequency of every word in ``doc``.

    The document is split on whitespace. Each distinct word is mapped to
    its number of occurrences divided by the total number of words, so the
    values of a non-empty document sum to 1. An empty document yields an
    empty dict.
    """
    tokens = doc.split()
    n_tokens = len(tokens)

    # Raw occurrence count per distinct token, in first-seen order.
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

    # Normalize the counts by the document length.
    return {token: count / n_tokens for token, count in counts.items()}

# Print the term-frequency table of every example document, one per line.
for doc in docs:
    print(compute_tf(doc))
{'this': 0.2, 'is': 0.2, 'a': 0.2, 'sample': 0.2, 'document': 0.2}
{'this': 0.2, 'document': 0.2, 'is': 0.2, 'another': 0.2, 'example': 0.2}
{'this': 0.2, 'example': 0.2, 'document': 0.2, 'is': 0.2, 'different': 0.2}

Function to calculate the inverse document frequency (IDF)

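IDF is a measure of how informative a word is across a collection (or corpus) of documents. The intuition behind IDF is that words that appear in many documents are less informative than those that appear in fewer documents. Here it is computed as $\mathrm{idf}(t) = \ln\left(N / \mathrm{df}(t)\right)$, where $N$ is the number of documents and $\mathrm{df}(t)$ is the number of documents that contain the word $t$; a word that occurs in every document therefore gets an IDF of 0.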

# Example corpus: the same three sentences as in the term-frequency section,
# repeated so that this section can be run on its own.
docs = [
    "this is a sample document",
    "this document is another example",
    "this example document is different"
]


import math

# Calculate idf
def compute_idf(docs):
    """Return the inverse document frequency of every word in ``docs``.

    Each document is split on whitespace and a word is counted at most once
    per document. A word's IDF is the natural logarithm of the number of
    documents divided by the number of documents that contain it, so a
    word present in every document scores 0.0.
    """
    n_docs = len(docs)

    # Document frequency: in how many documents each word occurs.
    doc_freq = {}
    for text in docs:
        for term in set(text.split()):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    return {term: math.log(n_docs / df) for term, df in doc_freq.items()}

# Show the IDF table computed over the whole example corpus.
print(compute_idf(docs))
{'document': 0.0, 'is': 0.0, 'sample': 1.0986122886681098, 'a': 1.0986122886681098, 'this': 0.0, 'example': 0.4054651081081644, 'another': 1.0986122886681098, 'different': 1.0986122886681098}

Functions to calculate TF-IDF

TF-IDF stands for Term Frequency-Inverse Document Frequency and is used to assess the importance of a word in a document in relation to a collection of documents (corpus).

# Example corpus: the same three sentences as in the sections above,
# repeated so that this section can be run on its own.
docs = [
    "this is a sample document",
    "this document is another example",
    "this example document is different"
]

import math

# Step 1: compute TF
def compute_tf(doc):
    """Return the relative frequency of every word in ``doc``.

    The document is split on whitespace. Each distinct word is mapped to
    its number of occurrences divided by the total number of words, so the
    values of a non-empty document sum to 1. An empty document yields an
    empty dict.
    """
    tokens = doc.split()
    n_tokens = len(tokens)

    # Raw occurrence count per distinct token, in first-seen order.
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

    # Normalize the counts by the document length.
    return {token: count / n_tokens for token, count in counts.items()}

# Step 2: compute IDF
def compute_idf(docs):
    """Return the inverse document frequency of every word in ``docs``.

    Each document is split on whitespace and a word is counted at most once
    per document. A word's IDF is the natural logarithm of the number of
    documents divided by the number of documents that contain it, so a
    word present in every document scores 0.0.
    """
    n_docs = len(docs)

    # Document frequency: in how many documents each word occurs.
    doc_freq = {}
    for text in docs:
        for term in set(text.split()):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    return {term: math.log(n_docs / df) for term, df in doc_freq.items()}

# Step 3: compute TF-IDF
def compute_tfidf(tf, idf):
    """Combine one document's term frequencies with corpus-wide IDF values.

    ``tf`` maps each word of a document to its term frequency and ``idf``
    maps words to their inverse document frequency. The result has the
    same keys as ``tf``; each score is the product of the two values, and
    a word that is missing from ``idf`` is given an IDF of 0.
    """
    return {term: freq * idf.get(term, 0) for term, freq in tf.items()}

# Term-frequency table for each document, in corpus order
tf_list = list(map(compute_tf, docs))

# A single IDF table shared by the whole corpus
idf = compute_idf(docs)

# Weight every document's term frequencies by the corpus-wide IDF
tfidf_list = [compute_tfidf(doc_tf, idf) for doc_tf in tf_list]

# Report the scores document by document (numbered from 1),
# each score rounded to four decimals
for i, tfidf in enumerate(tfidf_list):
    print(f"TF-IDF for dokument {i+1}:")
    for word, score in tfidf.items():
        print(f"  {word}: {score:.4f}")

import pandas as pd
# Collect the per-document TF-IDF dicts into a table: one row per document,
# one column per word. Words absent from a document are filled in with 0.
df = pd.DataFrame(tfidf_list)
df = df.fillna(0)
print('\n')
df  # bare expression: shown as the cell result when run in a notebook
TF-IDF for dokument 1:
  this: 0.0000
  is: 0.0000
  a: 0.2197
  sample: 0.2197
  document: 0.0000
TF-IDF for dokument 2:
  this: 0.0000
  document: 0.0000
  is: 0.0000
  another: 0.2197
  example: 0.0811
TF-IDF for dokument 3:
  this: 0.0000
  example: 0.0811
  document: 0.0000
  is: 0.0000
  different: 0.2197

this is a sample document another example different
0 0.0 0.0 0.219722 0.219722 0.0 0.000000 0.000000 0.000000
1 0.0 0.0 0.000000 0.000000 0.0 0.219722 0.081093 0.000000
2 0.0 0.0 0.000000 0.000000 0.0 0.000000 0.081093 0.219722