|
import nltk |
|
from nltk.corpus import stopwords |
|
# Download the NLTK stop-word corpus that stopwords.words() reads from.
nltk.download('stopwords')

# Load the English list under its own name. Assigning the list back to
# `stopwords` would replace the imported corpus reader with a plain list,
# and any later stopwords.words('english') call would then only work after
# importing the reader a second time.
english_stopwords = stopwords.words('english')
|
|
|
# Top 5 reviews with score > 4 for the Olay beauty cream, taken from:
# https://www.amazon.in/Olay-Total-Effects-Normal-renewal/dp/B00ENZT4M8
# The review texts are kept verbatim (including their original typos) because
# they are the raw input to the tokenizer and topic models.

doc_a = "As the human skin comes in contact with dry, acidic and polluted air it gets affected in multiple ways that harm its texture, freshness and glow, so maintain all that healthy skin this cream is helpful and, in some cases, even necessary.."

doc_b = "I use the night Firming Facial after the citrus scrub. This is a wonderful feeling cream. It is not greasy. I believe Olay Products are the best I have ever bought. I was so exhausted with buying expensive creams. They ended up being very heavy and greasy. I have used the products for about 3 weeks. I do see a difference in my face. So happy!!."

doc_c = "I love Olay products. I've been using Olay since i turned 29. I am now 42 and my skin looks pretty good. Not too many wrinkles. With that said... I tried the Total effects night firming cream and i don't feel it is as good for my 42 year old skin as the Olay Pro-X Hydra firming cream. I feel this Total effects product would be great for later 20's to mid 30's. Just my opinion (and how my skin is). But i have been using the brand for years and Olay is an excellent product."

doc_d = "Olay products have been my favorite from a long time. I like this one too as it works great as moisturizer plus it has spf too. It like its texture and it feels very light on skin. It absorbs into skin so quickly. I would definitely recommend this Olay cream to everyone who is looking for a good day cream which helps in reducing ageing signs."

doc_e = "This product is really good. I have been using this since couple of months and can see visible changes in my skin. It gives enough moisture and reduced my acne to some extent.My skin feels smooth.I am very happy with this product.."

# Compile the sample documents into a list; the order here is the document
# order used by everything that iterates over doc_set.
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]
|
|
|
#tokenize |
|
from nltk.tokenize import RegexpTokenizer |
|
from nltk.corpus import stopwords |
|
#from stop_words import get_stop_words |
|
from nltk.stem.porter import PorterStemmer |
|
from gensim import corpora, models |
|
import gensim |
|
|
|
# Tokenizer that emits every maximal run of word characters (\w+); anything
# that is not a word character (spaces, punctuation) only separates tokens.
tokenizer = RegexpTokenizer(r'\w+')

# English stop-word list used to filter tokens.
en_stop = stopwords.words('english')

# Porter stemmer used to reduce each kept token to its stem.
p_stemmer = PorterStemmer()
|
|
|
# Pre-process every document in doc_set into a list of stemmed tokens with
# the stop words removed.

# Build the lookup set once: membership tests against a set are O(1), whereas
# testing every token against the en_stop list scans the list each time.
stop_set = set(en_stop)

# One token list per document, in the same order as doc_set.
texts = []

for document in doc_set:
    # Lower-case the text, then split it into word tokens.
    tokens = tokenizer.tokenize(document.lower())

    # Drop stop words.
    stopped_tokens = [token for token in tokens if token not in stop_set]

    # Stem what is left and store the result for this document.
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
    texts.append(stemmed_tokens)
|
|
|
# Build the token <-> integer id dictionary from the tokenized documents.
dictionary = corpora.Dictionary(texts)

print(dictionary)

# Convert every tokenized document into its bag-of-words form via doc2bow,
# giving the document-term representation as (token id, frequency) pairs.
corpus = [dictionary.doc2bow(text) for text in texts]

# Print, per document, each word together with its frequency in that document.
# `token_id` is used instead of `id` so the builtin id() is not shadowed.
for doc in corpus:
    print([[dictionary[token_id], freq] for token_id, freq in doc])
|
|
|
import numpy as np |
|
# Fit a TF-IDF model on the bag-of-words corpus (normalize=True is passed
# through to gensim's TfidfModel).
tfidf = models.TfidfModel(corpus, normalize=True)

# TF-IDF weighted view of the corpus; the LDA and LSI models are trained on it.
corpus_tfidf = tfidf[corpus]

print(tfidf)

# Print, per document, each word with its TF-IDF weight rounded to 2 decimals.
# Reuses corpus_tfidf rather than applying the transformation a second time,
# and avoids shadowing the builtin id().
for doc in corpus_tfidf:
    print([[dictionary[token_id], np.round(weight, 2)] for token_id, weight in doc])
|
|
|
# The dictionary passed as id2word was built from the processed tokens
# (stop words removed, tokens stemmed).
# NOTE(review): LDA is formulated over integer term counts (the bag-of-words
# `corpus`), while TF-IDF produces real-valued weights. The model below is
# trained on the TF-IDF corpus - confirm that this is intended. The
# count-based variant is kept here for reference:
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=80)

# Train a 5-topic LDA model with 80 passes over the TF-IDF weighted corpus.
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics=5, id2word=dictionary, passes=80)

# Overview: the top 5 words of each of the 5 topics.
print(ldamodel.show_topics(num_topics=5, num_words=5, log=False, formatted=True))

print('LDA Model')

# Topic ids run from 0 to num_topics - 1 inclusive. range(num_topics) visits
# all of them; the previous upper bound of num_topics - 1 skipped the last topic.
for topic_id in range(ldamodel.num_topics):
    print('Topic', topic_id)
    # Formatted string form of the topic.
    print(ldamodel.print_topic(topic_id))
    # The topic's 20 highest-weighted words as (word, weight) pairs.
    print(ldamodel.show_topic(topic_id, topn=20))
|
|
|
|
|
print('LSI Model')

# Train a 5-topic LSI model on the TF-IDF weighted corpus. The variant trained
# on raw bag-of-words counts is kept here for reference:
# lsimodel = models.LsiModel(corpus, num_topics=5, id2word=dictionary)
lsimodel = models.LsiModel(corpus_tfidf, num_topics=5, id2word=dictionary)

# Topic ids run from 0 to num_topics - 1 inclusive. range(num_topics) visits
# all of them; the previous upper bound of num_topics - 1 skipped the last topic.
for topic_id in range(lsimodel.num_topics):
    print('Topic', topic_id)
    print(lsimodel.print_topic(topic_id))
|
|
|
|