|
# NLP notes and fundamentals

# What is one-hot encoding?
# A way to convert categorical data into a numerical format: every distinct
# category value becomes its own 0/1 column.

# One-hot encoding demo code
# Code reference: https://github.com/jalajthanaki/NLPython/blob/master/ch5
|
from __future__ import division |
|
import pandas as pd |
|
from sklearn.feature_extraction import DictVectorizer |
|
|
|
# Small categorical data set shared by both one-hot encoding demos below.
df = pd.DataFrame(
    [
        ['Bachelors', 'EntryLevel', 'Male'],
        ['Masters', 'Management', 'Female'],
        ['Phd', 'Sales', 'Male'],
    ],
    columns=['Education', 'Segment', 'Gender'],
)
print(df)

# pandas: expand every categorical column into one indicator column per value.
print(pd.get_dummies(df))

# using Sci-kit learn
# DictVectorizer consumes one {column: value} dict per row; string values are
# one-hot encoded into "<column>=<value>" features.
v = DictVectorizer()
qualitative_features = ['Segment']
x_qual = v.fit_transform(df[qualitative_features].to_dict('records'))

# vocabulary_ maps each generated feature name to its column index.
print(v.vocabulary_)
# fit_transform returned a sparse matrix here; densify it for display.
print(x_qual.toarray())
|
|
|
# What are n-grams?
# An n-gram is a run of N consecutive tokens taken from a text.

# For N = 1 and the text "This is a sentence"
# the unigrams are: This, is, a, sentence

# For N = 2 and the text "This is a sentence"
# the bigrams are: This is, is a, a sentence

# For N = 3 and the text "This is a sentence"
# the trigrams are: This is a, is a sentence

# n-grams demo
|
from nltk import ngrams |
|
sentence = 'test the words for multiple types of possible ngrams to generate'

# Tokenise once on whitespace; every n-gram size below reuses the same tokens.
tokens = sentence.split()

# Print the 2-grams, 3-grams and 4-grams of the sentence, one n-gram per line,
# each group preceded by a header naming its size.
for ngram_size in (2, 3, 4):
    print('ngram = ' + str(ngram_size))
    ngramresults = ngrams(tokens, ngram_size)
    for data in ngramresults:
        print(data)
|
|
|
# What is the bag-of-words (BOW) model?
# BOW example
# Sentence - The dog is on the table
# Vocabulary - are, cat, dog, is, now, on, the, table
# BOW representation - 0, 0, 1, 1, 0, 1, 1, 1
# (one slot per vocabulary word: 1 if the word occurs in the sentence, else 0)
|
|
|
from sklearn.feature_extraction.text import CountVectorizer |
|
import numpy as np |
|
|
|
# The list passed to fit_transform is the corpus: two documents, each made of
# several words. In the printed matrices every row is one document and every
# column is one character n-gram.
documents = [
    'this is document1 movies data',
    'this is document2 text data is present',
]

# Run the demo twice: character bigrams first (ngram=2), then single
# characters (ngram=1). analyzer='char_wb' builds character n-grams from text
# inside word boundaries only.
for ngram_size in (2, 1):
    ngram_vectorizer = CountVectorizer(
        analyzer='char_wb',
        ngram_range=(ngram_size, ngram_size),
        min_df=1,
    )
    counts = ngram_vectorizer.fit_transform(documents)

    # Show the column labels of the count matrix. They are read from
    # vocabulary_ (feature -> column index) and ordered by column index, which
    # works on every scikit-learn release; get_feature_names() was removed in
    # scikit-learn 1.2.
    vocabulary = ngram_vectorizer.vocabulary_
    print(sorted(vocabulary, key=vocabulary.get))

    # Dense integer view of the sparse document/feature count matrix.
    print(counts.toarray().astype(int))
|
|
|
# Term Frequency, Document Frequency, Inverse Document Frequency
# Term Frequency (TF) - number of times term t appears in document d
# Document Frequency (DF) - number of documents in the collection that contain term t
# Inverse Document Frequency (IDF) - log(N / df_t), where N is the number of
# documents in the collection and df_t is the number of documents containing term t
# IDF = log(total docs / docs containing the term)
# The lower the IDF, the more documents the term occurs in
|
|
|
|
|
from textblob import TextBlob |
|
import math |
|
|
|
def tf(word, blob):
    """Return the term frequency of ``word`` in ``blob``.

    The frequency is the number of occurrences reported by
    ``blob.words.count(word)`` divided by the total number of words in the
    blob.

    Args:
        word: The term to look up.
        blob: An object exposing a ``words`` sequence (e.g. a ``TextBlob``).

    Returns:
        The relative frequency as a float, or ``0.0`` when the blob has no
        words (avoids a ``ZeroDivisionError`` on empty documents).
    """
    words = blob.words
    total = len(words)
    if total == 0:
        return 0.0
    return words.count(word) / total
|
|
|
def n_containing(word, bloblist):
    """Return one plus the number of blobs whose words include ``word``.

    The extra one is an add-one offset: the result is never zero, so it can be
    used directly as a divisor.

    Membership is tested against ``blob.words`` (whole tokens). Testing
    ``word in blob`` on a ``TextBlob`` is a substring search over the raw
    text, which would also count partial-word hits such as "data" inside
    "metadata".

    Args:
        word: The term to look up.
        bloblist: Iterable of objects exposing a ``words`` sequence.

    Returns:
        ``1 + (number of blobs containing word)`` as an int.
    """
    matches = sum(1 for blob in bloblist if word in blob.words)
    return 1 + matches
|
|
|
def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as ``log(len(bloblist) / n_containing(word, bloblist))``.
    ``n_containing`` already adds one to the document count, so a word that
    occurs in every document gets a slightly negative score
    (``log(N / (N + 1))``).

    Args:
        word: The term to score.
        bloblist: Sequence of documents making up the corpus.

    Returns:
        The natural-log IDF as a float.

    Raises:
        ValueError: If ``bloblist`` is empty; IDF is undefined without
            documents (previously surfaced as a "math domain error").
    """
    total_docs = len(bloblist)
    if total_docs == 0:
        raise ValueError('idf() needs at least one document in bloblist')
    doc_count = n_containing(word, bloblist)
    # n_containing never returns zero today; the fallback only guards the
    # division should that ever change.
    return math.log(total_docs / (doc_count or 1))
|
|
|
def tfidf(word, blob, bloblist):
    """Return the TF-IDF weight of ``word`` for ``blob`` within ``bloblist``.

    Args:
        word: The term to score.
        blob: The document the term frequency is measured in.
        bloblist: The corpus the inverse document frequency is measured over.

    Returns:
        ``tf(word, blob) * idf(word, bloblist)``.
    """
    return tf(word, blob) * idf(word, bloblist)
|
|
|
# Three tiny documents forming the demo corpus.
text1 = "term frequency document frequency tf idf"
text2 = "numeric stats intended to reflect the data format"
text3 = "data collection in corpus data"

blob1 = TextBlob(text1)
blob2 = TextBlob(text2)
blob3 = TextBlob(text3)

bloblist = [blob1, blob2, blob3]

# Score the word 'frequency': TF within the first document, IDF over the whole
# corpus, and their product.
tf_score = tf('frequency', blob1)
idf_score = idf('frequency', bloblist)
tfidf_score = tfidf('frequency', blob1, bloblist)

print('tf score is ' + str(tf_score))
print('idf score is ' + str(idf_score))
print('tfidf score is ' + str(tfidf_score))