# Source code for review_recommender.inverted_files

from __future__ import annotations
import math
from dataclasses import dataclass


@dataclass
class ItemReference:
    """
    Wrapper around an indexed item that also carries the squared length
    of the item's weighted token vector.

    Instances hash like the wrapped item, so ``item`` must be hashable.
    Equality is the one generated by ``@dataclass`` and compares both
    ``item`` and ``length_squared``.
    """

    # The indexed object itself (a document or any other hashable value).
    item: object
    # Squared Euclidean length of the item's weighted token vector;
    # InvertedFile.calculateIDF accumulates (idf * count) ** 2 into it.
    length_squared: float

    def __hash__(self) -> int:
        """Return the hash of the wrapped item."""
        # Use the built-in hash() rather than calling the dunder directly:
        # item.__hash__() resolves to an unbound method when the item is
        # itself a class object and then raises TypeError.
        return hash(self.item)

@dataclass(eq=True, repr=True)
class TokenOccurence:
    """One entry of a token's occurrence list: an item and a count."""

    # Reference to the item in which the token occurs.
    item_ref: ItemReference
    # Count of the token recorded for that item.
    count: int


@dataclass(eq=True, repr=True)
class TokenInfo:
    """Per-token record of the index: an IDF value and an occurrence list."""

    # Inverse document frequency of the token; InvertedFile.add creates the
    # record with 0 and InvertedFile.calculateIDF overwrites it.
    idf: float
    # One TokenOccurence per item the token was added for.
    occ_list: list[TokenOccurence]


class InvertedFile:
    """
    A class that models a basic inverted file, to which you can add
    key value pairs of tokens to their count associated with an item
    (could be a document or anything else, the only condition is that
    it is hashable).
    Can be queried to get similar items based on cosine similarity.

    Each item is expected to be added once; every call to ``add`` is
    counted as one item of the collection.

    Attributes:
        token2Items: maps every known token to its ``TokenInfo``
            (IDF value and list of occurrences).
        totalItems: number of items added so far.
    """

    def __init__(self):
        self.token2Items: dict[str, TokenInfo] = {}
        self.totalItems = 0

    def add(self, item, token_freq: dict[str, int]) -> None:
        """
        Add an item with its associated token frequencies.

        Args:
            item: the (hashable) object to index.
            token_freq(dict[str, int]): pairs of token and their count
        """
        # A single reference is shared by all occurrences of this item so
        # that its vector length and its query score are accumulated over
        # every token of the item instead of being split per token.
        itemRef = ItemReference(item, length_squared=0)
        for token, count in token_freq.items():
            tokenInfo = self.token2Items.get(token)
            if tokenInfo is None:
                tokenInfo = TokenInfo(idf=0, occ_list=[])
                self.token2Items[token] = tokenInfo
            tokenInfo.occ_list.append(TokenOccurence(itemRef, count))
        # Count the item once, not once per token: the IDF formula needs
        # the number of items in the collection.
        self.totalItems += 1

    def calculateIDF(self) -> None:
        """
        Recompute the IDF of every token and the squared vector length
        of every indexed item.

        The IDF of a token is ``log2(totalItems / number of items that
        contain the token)``. The squared length of an item is the sum of
        ``(idf * count) ** 2`` over its tokens. The method can be called
        repeatedly; lengths are rebuilt from scratch on every call.
        """
        for tokenInfo in self.token2Items.values():
            tokenInfo.idf = math.log2(self.totalItems / len(tokenInfo.occ_list))
            # Reset before accumulating, otherwise every call would add
            # the same contributions on top of the previous ones.
            for tokenOccurrence in tokenInfo.occ_list:
                tokenOccurrence.item_ref.length_squared = 0

        for tokenInfo in self.token2Items.values():
            idf = tokenInfo.idf
            for tokenOccurrence in tokenInfo.occ_list:
                weight = idf * tokenOccurrence.count
                tokenOccurrence.item_ref.length_squared += weight ** 2

    def getSimilar(self, tokenFreqs):
        """
        Score the indexed items against a query, that is a new
        dictionary of tokens with their count.

        Query tokens that are not in the index are ignored. Only items
        sharing at least one known token with the query are returned.

        Args:
            tokenFreqs(dict[str, int]): pairs of token and their count

        Returns:
            dict: maps each retrieved item to its cosine similarity with
            the query. The score is ``0.0`` when the query vector or the
            item vector has zero length (for instance when all the
            matching tokens appear in every item and so have an IDF of 0).
        """
        self.calculateIDF()
        retrievedRef2score: dict[ItemReference, float] = {}
        queryLengthSquared = 0
        for token, count in tokenFreqs.items():
            tokenInfo = self.token2Items.get(token)
            if tokenInfo is None:
                continue
            idf = tokenInfo.idf
            weight = count * idf
            queryLengthSquared += weight ** 2
            for occurrence in tokenInfo.occ_list:
                itemRef = occurrence.item_ref
                # Dot product term: query weight times item weight.
                contribution = weight * idf * occurrence.count
                retrievedRef2score[itemRef] = (
                    retrievedRef2score.get(itemRef, 0) + contribution
                )
        queryLength = math.sqrt(queryLengthSquared)

        retrievedItem2score = {}
        for retrieved, score in retrievedRef2score.items():
            denominator = queryLength * math.sqrt(retrieved.length_squared)
            # A zero-length vector has no direction; report no similarity
            # instead of raising ZeroDivisionError.
            retrievedItem2score[retrieved.item] = (
                score / denominator if denominator else 0.0
            )

        return retrievedItem2score

    def dump(self) -> None:
        """Print the tokens currently stored in the index."""
        print(self.token2Items.keys())