"No one is harder on a talented person than the person themselves" - Linda Wilkinson ; "Trust your guts and don't follow the herd" ; "Validate direction not destination" ;

January 01, 2020

Data Science Experiment - Milk Adulteration

Data - Link


import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import sys
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn import svm
from imblearn.over_sampling import SVMSMOTE
verbose = False
ratio = 'auto'
print(sys.version)
input_file = "TrainDataBinaryClassification.xls"
df = pd.read_csv(input_file,header=0,sep=",")
print(df.head())
#Remove the insignificant Id column
df.drop(columns=['Id'], inplace=True)
#List all column headers
print(list(df))
#Fill missing values
df = df.fillna(-999)
features = list(df.columns[:-1])
print(features)
y1 = df['class']
x1 = df[features]
#Option 1 (requires: from imblearn.combine import SMOTEENN)
#SENN = SMOTEENN(sampling_strategy=ratio)
#x, y = SENN.fit_resample(x1, y1)
#Option #2
sm = SVMSMOTE()  # SMOTE variant that generates minority samples near the SVM decision boundary
x, y = sm.fit_resample(x1, y1)
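# Sketch of a sanity check (not in the original script): after resampling,
# the class counts should be roughly balanced; pd is the pandas import above.
print(pd.Series(y).value_counts())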
pred_train, pred_test, tar_train, tar_test = train_test_split(x, y, test_size=0.3)
print('Shape of test data', pred_test.shape)
#Base learners for the voting ensemble
classifier = tree.DecisionTreeClassifier(criterion="entropy")
classifier = classifier.fit(x, y)
classifier2 = svm.SVC()
classifier2 = classifier2.fit(x, y)
classifier3 = RandomForestClassifier(n_estimators=250)  # 250 trees; n_jobs would only control parallelism
classifier3 = classifier3.fit(x, y)
clfs = [classifier, classifier2, classifier3]
clf = EnsembleVoteClassifier(clfs=clfs, voting='hard', weights=[4, 4, 5])
clf.fit(x, y)
input_file = "TestDataTwoClass.xls"
df = pd.read_csv(input_file,header=0,sep=",")
df2 = pd.read_csv(input_file,header=0,sep=",")
df.drop(['Id'],1,inplace=True)
df = df.fillna(-999)
x = df[features]
predictions = clf.predict(x)
print('predictions')
for p in predictions:
    print(p)
df['class'] = predictions
df2['class'] = predictions
print('count', df['class'].value_counts())
header = ["Id","class"]
df2.to_csv("Results_Binary_Class_Adulteration_Sep18_2.csv", sep=',', columns = header,index=False)
Experiment1.py
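The ensemble above is fit on the full resampled data, and the 30% hold-out split is never actually scored. A minimal evaluation sketch, assuming the pred_test/tar_test variables from the split above (accuracy_score and confusion_matrix are standard sklearn.metrics functions):

from sklearn.metrics import accuracy_score, confusion_matrix
# Score the voting ensemble on the 30% hold-out split
ensemble_preds = clf.predict(pred_test)
print('Ensemble accuracy', accuracy_score(tar_test, ensemble_preds))
print(confusion_matrix(tar_test, ensemble_preds))

Since the base learners were fit on the full resampled set, this estimate will be optimistic; refitting them on pred_train alone gives a fairer number.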
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
import sklearn.metrics
import sys
sys.path.append('C:\\Anaconda2\\xgboost')
import xgboost as xgb
#2.7.12 |Anaconda 4.0.0 (64-bit)|
print(sys.version)
input_file = "TrainDataBinaryClassification.xls"
df = pd.read_csv(input_file,header=0,sep=",")
print(df.head())
#Remove the insignificant Id column
df.drop(columns=['Id'], inplace=True)
#List all column headers
print(list(df))
#Fill missing values
df = df.fillna(-99)
features = list(df.columns[:-1])
print(features)
y = df['class']
x = df[features]
pred_train, pred_test, tar_train, tar_test = train_test_split(x, y, test_size=0.3)
print('Shape of test data', pred_test.shape)
#classifier = tree.DecisionTreeClassifier(criterion="entropy")
#classifier = classifier.fit(pred_train,tar_train)
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(x, y)
#print('acc', classifier.score(x,y))
#predictions = classifier.predict(pred_test)
#print(predictions)
#print(sklearn.metrics.confusion_matrix(tar_test,predictions))
#print('Classifier Accuracy')
#print(sklearn.metrics.accuracy_score(tar_test,predictions))
input_file = "TestDataTwoClassResults.xls"
df = pd.read_csv(input_file,header=0,sep=",")
df2 = pd.read_csv(input_file,header=0,sep=",")
df.drop(['Id'],1,inplace=True)
df = df.fillna(-99)
x = df[features]
predictions = gbm.predict(x)
print('predictions')
#print(predictions)
for p in predictions:
    print(p)
#print('count',len(predictions))
df2['class'] = predictions
#df.to_csv("Results_Adulteration.csv", sep=',',index=False)
header = ["Id","class"]
df2.to_csv("Results_Adulteration_Sep15.csv", sep=',', columns = header,index=False)
Experiment2.py
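Before predicting on the test file, the booster can be sanity-checked with cross-validation on the training frame. A minimal sketch, assuming the x and y defined above (cross_val_score is from sklearn.model_selection):

from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy for the same XGBoost configuration
cv_model = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)
scores = cross_val_score(cv_model, x, y, cv=5, scoring='accuracy')
print('CV accuracy %.3f +/- %.3f' % (scores.mean(), scores.std()))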
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
import sklearn.metrics
import sys
from sklearn.ensemble import RandomForestClassifier
#2.7.12 |Anaconda 4.0.0 (64-bit)|
print(sys.version)
input_file = "TrainDataMultiClassClassification_Custom_sep18.csv"
df = pd.read_csv(input_file,header=0,sep=",")
print(df.head())
#Remove the insignificant Id column
df.drop(columns=['Id'], inplace=True)
#List all column headers
print(list(df))
#Fill missing values
df = df.fillna(-9999)
features = list(df.columns[:-1])
print(features)
y = df['class']
x = df[features]
pred_train, pred_test, tar_train, tar_test = train_test_split(x, y, test_size=0.3)
print('Shape of test data', pred_test.shape)
classifier = tree.DecisionTreeClassifier(criterion="entropy")
classifier = classifier.fit(pred_train, tar_train)
print('acc', classifier.score(pred_test, tar_test))  # score on the hold-out split, not the data it was trained on
predictions = classifier.predict(pred_test)
rf = RandomForestClassifier(n_estimators=300) # initialize
rf.fit(x, y) # fit the data to the algorithm
input_file = "TestDataMultiClass.xls"
df = pd.read_csv(input_file,header=0,sep=",")
df2 = pd.read_csv(input_file, header=0, sep=",")  # df2 keeps the Id column for the submission file
df.drop(columns=['Id'], inplace=True)
df = df.fillna(-9999)
x = df[features]
predictions = rf.predict(x)
#predictions = classifier.predict(x)
print('predictions')
#print(predictions)
for p in predictions:
    print(p)
df['class'] = predictions
df2['class'] = predictions
print('count', df['class'].value_counts())
#df.to_csv("Results_Multi_Class_Adulteration.csv", sep=',',index=False)
header = ["Id","class"]
df2.to_csv("Results_Multi_Class_Adulteration_2_Sep18_RF.csv", sep=',', columns = header,index=False)
Experiment3.py
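Accuracy alone hides per-class behaviour in the multi-class setting. A minimal sketch of a per-class report, assuming the pred_test/tar_test split above; the decision tree (fit only on pred_train) is the fair model to score here, since the random forest was fit on the full frame:

from sklearn.metrics import classification_report
# Per-class precision/recall/F1 on the 30% hold-out split
holdout_preds = classifier.predict(pred_test)
print(classification_report(tar_test, holdout_preds))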

Happy Learning!!!
