"No one is harder on a talented person than the person themselves" - Linda Wilkinson ; "Trust your guts and don't follow the herd" ; "Validate direction not destination" ;

September 14, 2018

Day #128 - NLP Basics - Demo and Notes

Found some code snippets for quick reference. Adding code examples and basic concepts to the NLP learning kit.

#NLP Notes and fundamentals
#What is one hot encoding
#A way to convert categorical data into a numerical (0/1 indicator) format
#One hot encoding demo code
#Code - Reference - https://github.com/jalajthanaki/NLPython/blob/master/ch5
from __future__ import division
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
df = pd.DataFrame([['Bachelors', 'EntryLevel', 'Male'],
                   ['Masters', 'Management', 'Female'],
                   ['Phd', 'Sales', 'Male']],
                  columns=['Education', 'Segment', 'Gender'])
print(df)
print(pd.get_dummies(df))
#using scikit-learn DictVectorizer
v = DictVectorizer()
qualitative_features = ['Segment']
x_qual = v.fit_transform(df[qualitative_features].to_dict('records'))
print(v.vocabulary_)
print(x_qual.toarray())
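#A third option (my addition, a sketch assuming scikit-learn >= 0.20 so that
#OneHotEncoder accepts string columns directly): the usual choice in ML pipelines
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
onehot = enc.fit_transform(df[['Gender']])
print(enc.categories_)
print(onehot.toarray())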
#What are ngrams
#An ngram is a contiguous sequence of N tokens from a text
#For N = 1, sentence - This is a sentence
#Unigrams are - This, is, a, sentence
#For N = 2, sentence - This is a sentence
#Bigrams are - This is, is a, a sentence
#For N = 3, sentence - This is a sentence
#Trigrams are - This is a, is a sentence
#ngrams Demo
from nltk import ngrams
sentence = 'test the words for multiple types of possible ngrams to generate'
#ngrams2
print('ngram = 2')
ngramresults = ngrams(sentence.split(),2)
for data in ngramresults:
    print(data)
#ngrams3
print('ngram = 3')
ngramresults = ngrams(sentence.split(),3)
for data in ngramresults:
    print(data)
#ngrams4
print('ngram = 4')
ngramresults = ngrams(sentence.split(),4)
for data in ngramresults:
    print(data)
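#All sizes in one pass (my addition, a sketch assuming nltk.util.everygrams,
#available in recent NLTK releases): yields every ngram from min_len to max_len
from nltk.util import everygrams
print('everygrams, n = 1 to 3')
for data in everygrams(sentence.split(), min_len=1, max_len=3):
    print(data)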
#What is the bag of words (BOW) model
#BOW represents a document as term counts over a fixed vocabulary, ignoring word order
#BOW example (binary)
#Sentence - The dog is on the table
#Vocabulary - are, cat, dog, is, now, on, the, table
#BOW representation - 0, 0, 1, 1, 0, 1, 1, 1
from sklearn.feature_extraction.text import CountVectorizer
#word-level bigrams (ngram = 2) over the two documents in the list
ngram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2), min_df=1)
counts = ngram_vectorizer.fit_transform(['this is document1 movies data', 'this is document2 text data is present'])
print(ngram_vectorizer.get_feature_names_out())
print(counts.toarray().astype(int))
#word-level unigrams (ngram = 1) gives the classic BOW term counts per document
ngram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
counts = ngram_vectorizer.fit_transform(['this is document1 movies data', 'this is document2 text data is present'])
print(ngram_vectorizer.get_feature_names_out())
print(counts.toarray().astype(int))
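#Checking the hand-worked dog/table example above (my addition): a word-level
#CountVectorizer with binary=True and a fixed vocabulary reproduces that vector
bow_vectorizer = CountVectorizer(binary=True, vocabulary=['are', 'cat', 'dog', 'is', 'now', 'on', 'the', 'table'])
print(bow_vectorizer.fit_transform(['The dog is on the table']).toarray())
#expected output - [[0 0 1 1 0 1 1 1]]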
#Term Frequency, Document Frequency, Inverse Document Frequency
#Term Frequency - number of times term t appears in document d (often divided by document length)
#Document Frequency - number of documents in the collection in which the term appears
#Inverse Document Frequency - IDF = log(N / df_t), where N = total documents and df_t = documents containing term t
#The lower the IDF, the more documents the term occurs in (common terms get low weight)
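#Worked example (my addition): with N = 3 documents and a term appearing in 1 of them,
#IDF = log(3/1) ≈ 1.10; a term appearing in all 3 gets IDF = log(3/3) = 0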
from textblob import TextBlob
import math
def tf(word, blob):
    #term frequency - count of the word in the document / total words in the document
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    #number of documents containing the word, +1 smoothing to avoid division by zero
    return 1 + sum(1 for blob in bloblist if word in blob)

def idf(word, bloblist):
    #with the +1 smoothing above, the denominator is never zero
    return math.log(len(bloblist) / n_containing(word, bloblist))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)
text1 = "term frequency document frequency tf idf"
text2 = "numeric stats intended to reflect the data format"
text3 = "data collection in corpus data"
blob1 = TextBlob(text1)
blob2 = TextBlob(text2)
blob3 = TextBlob(text3)
bloblist = [blob1,blob2,blob3]
tf_score = tf('frequency',blob1)
idf_score = idf('frequency',bloblist)
tfidf_score = tfidf('frequency',blob1,bloblist)
print('tf score is '+str(tf_score))
print('idf score is '+str(idf_score))
print('tfidf score is '+str(tfidf_score))
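#The same computation via scikit-learn (my addition; a sketch assuming
#scikit-learn >= 1.0 for get_feature_names_out). TfidfVectorizer applies a
#smoothed IDF and L2 normalization by default, so scores differ from the manual version above
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([text1, text2, text3])
print(tfidf_vectorizer.get_feature_names_out())
print(tfidf_matrix.toarray().round(3))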


Happy Learning!!!
