This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#NLP notes and fundamentals
#What is one-hot encoding?
#A way to convert categorical data into numerical format
#One-hot encoding demo code
#Code reference - https://github.com/jalajthanaki/NLPython/blob/master/ch5
from __future__ import division | |
import pandas as pd | |
from sklearn.feature_extraction import DictVectorizer | |
# Demo: one-hot encode categorical columns in two ways.
# 1) pandas.get_dummies expands every object column into 0/1 indicator columns.
df = pd.DataFrame(
    {
        'Education': ['Bachelors', 'Masters', 'Phd'],
        'Segment': ['EntryLevel', 'Management', 'Sales'],
        'Gender': ['Male', 'Female', 'Male'],
    }
)
print(df)
print(pd.get_dummies(df))
# 2) scikit-learn's DictVectorizer builds the same encoding from record dicts.
v = DictVectorizer()
qualitative_features = ['Segment']
x_qual = v.fit_transform(df[qualitative_features].to_dict('records'))
print(v.vocabulary_)  # feature name -> column index
print(x_qual.toarray())
#What is an n-gram?
#For N = 1: "This is a sentence"
#Unigrams are - This, is, a, sentence
#For N = 2: "This is a sentence"
#Bigrams are - This is, is a, a sentence
#For N = 3: "This is a sentence"
#Trigrams are - This is a, is a sentence
#ngrams demo
from nltk import ngrams | |
# Demo: word-level n-grams for n = 2, 3 and 4 over one sample sentence.
# nltk.ngrams yields tuples of n consecutive tokens.
# The original repeated the same three lines once per n; a single loop
# produces identical output without the copy-paste.
sentence = 'test the words for multiple types of possible ngrams to generate'
tokens = sentence.split()
for n in (2, 3, 4):
    print('ngram = ' + str(n))
    for gram in ngrams(tokens, n):
        print(gram)
#What is the bag-of-words (BOW) model?
#BOW example
#Sentence - The dog is on the table
#Vocabulary - are, cat, dog, is, now, on, the, table
#BOW representation - 0, 0, 1, 1, 0, 1, 1, 1
from sklearn.feature_extraction.text import CountVectorizer | |
import numpy as np | |
# Demo: n-gram counts with CountVectorizer over two small documents.
# NOTE: analyzer='char_wb' extracts CHARACTER n-grams inside word boundaries,
# so the columns are character bigrams / unigrams, not word features.
# The original also evaluated `get_feature_names() == [...]` and discarded the
# result: a no-op whose API was removed in scikit-learn 1.2 (it now raises
# AttributeError). The dead lines are removed; use get_feature_names_out()
# if the feature names are ever needed. Printed output is unchanged.
docs = ['this is document1 movies data', 'this is document2 text data is present']

# Character bigrams: ngram_range=(2, 2).
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)
counts = ngram_vectorizer.fit_transform(docs)
print(counts.toarray().astype(int))

# Character unigrams: ngram_range=(1, 1).
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 1), min_df=1)
counts = ngram_vectorizer.fit_transform(docs)
print(counts.toarray().astype(int))
#Term Frequency, Document Frequency, Inverse Document Frequency
#Term Frequency - number of times term t appears in document d
#Document Frequency - number of documents in the collection in which the term appears
#Inverse Document Frequency - log(N / df_t) (documents in collection / documents containing term t)
#IDF = log[Total Docs / Docs containing the term]
#The lower the IDF, the more common the term across documents
from textblob import TextBlob | |
import math | |
def tf(word, blob):
    """Term frequency: fraction of tokens in `blob` that equal `word`."""
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    """Smoothed document frequency: 1 + number of documents containing `word`.

    The add-one smoothing guarantees a non-zero denominator in idf() even
    when the word appears in no document.
    """
    return 1 + sum(1 for blob in bloblist if word in blob)

def idf(word, bloblist):
    """Inverse document frequency: log(N / (1 + df)).

    n_containing() already returns at least 1, so the original
    `x if x else 1` guard was dead code and has been removed; the
    computed value is unchanged.
    """
    return math.log(len(bloblist) / n_containing(word, bloblist))

def tfidf(word, blob, bloblist):
    """TF-IDF score of `word` in document `blob` relative to `bloblist`."""
    return tf(word, blob) * idf(word, bloblist)
# TF-IDF demo: score the word 'frequency' against a three-document corpus.
text1 = "term frequency document frequency tf idf"
text2 = "numeric stats intended to reflect the data format"
text3 = "data collection in corpus data"
bloblist = [TextBlob(t) for t in (text1, text2, text3)]
blob1, blob2, blob3 = bloblist
tf_score = tf('frequency', blob1)
idf_score = idf('frequency', bloblist)
tfidf_score = tfidf('frequency', blob1, bloblist)
for label, score in (('tf', tf_score), ('idf', idf_score), ('tfidf', tfidf_score)):
    print(label + ' score is ' + str(score))
Happy Learning!!!
No comments:
Post a Comment