import re import porter from numpy import zeros,dot from numpy.linalg import norm __all__=['compare'] # import real stop words #stop_words = [ 'i', 'in', 'a', 'to', 'the', 'it', 'have', 'haven\'t', 'was', 'but', 'is', 'be', 'from' ] stop_words = [w.strip() for w in open('english.stop','r').readlines()] #print stop_words splitter=re.compile ( "[a-z\-']+", re.I ) stemmer=porter.PorterStemmer() def add_word(word,d): """ Adds a word the a dictionary for words/count first checks for stop words the converts word to stemmed version """ w=word.lower() if w not in stop_words: ws=stemmer.stem(w,0,len(w)-1) d.setdefault(ws,0) d[ws] += 1 def doc_vec(doc,key_idx): v=zeros(len(key_idx)) for word in splitter.findall(doc): keydata=key_idx.get(stemmer.stem(word,0,len(word)-1).lower(), None) if keydata: v[keydata[0]] = 1 return v def compare(doc1,doc2): # strip all punctuation but - and ' # convert to lower case # store word/occurance in dict all_words=dict() for dat in [doc1,doc2]: [add_word(w,all_words) for w in splitter.findall(dat)] # build an index of keys so that we know the word positions for the vector key_idx=dict() # key-> ( position, count ) keys=all_words.keys() keys.sort() #print keys for i in range(len(keys)): key_idx[keys[i]] = (i,all_words[keys[i]]) del keys del all_words v1=doc_vec(doc1,key_idx) v2=doc_vec(doc2,key_idx) return float(dot(v1,v2) / (norm(v1) * norm(v2))) if __name__ == '__main__': print "Running Test..." doc1="I like to eat chicken\nnoodle soup." doc2="I have read the book \"Chicken noodle soup for the soul\"." print "Using Doc1: %s\n\nUsing Doc2: %s\n" % ( doc1, doc2 ) print "Similarity %s" % compare(doc1,doc2)