Data - Link
Happy Learning!!!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Binary classification script.
#
# Reads the labelled training table, resamples it with SMOTE, fits a weighted
# hard-voting ensemble (decision tree, SVM, random forest), predicts a class
# for every row of the test table and writes an "Id,class" CSV.
import sys

import pandas as pd
from imblearn.over_sampling import SMOTE
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the previous module path.
    from sklearn.cross_validation import train_test_split

TRAIN_FILE = "TrainDataBinaryClassification.xls"
TEST_FILE = "TestDataTwoClass.xls"
OUTPUT_FILE = "Results_Binary_Class_Adulteration_Sep18_2.csv"
ID_COLUMN = 'Id'
TARGET_COLUMN = 'class'
# Sentinel written into every missing cell of both the train and test tables.
MISSING_VALUE = -999

# Not read by the active code path; `ratio` is only referenced by the
# disabled SMOTEENN option further down.
verbose = False
ratio = 'auto'

print(sys.version)

# NOTE(review): the input files carry an .xls extension but are parsed as
# comma-separated text -- confirm they really are CSV.
df = pd.read_csv(TRAIN_FILE, header=0, sep=",")
print(df.head())

# Remove the insignificant id column. `axis` is passed by keyword because
# newer pandas releases no longer accept it positionally.
df.drop([ID_COLUMN], axis=1, inplace=True)

# List all column headers
print(list(df))

# Fill missing values
df = df.fillna(MISSING_VALUE)

# Every column except the last one is treated as a feature.
# NOTE(review): presumably the last column is the 'class' label -- verify.
features = list(df.columns[:-1])
print(features)
y1 = df[TARGET_COLUMN]
x1 = df[features]

# Option 1 (disabled)
# SENN = SMOTEENN(ratio=ratio)
# x, y = SENN.fit_sample(x1, y1)
# Option 2
# NOTE(review): `kind=` and `fit_sample` look like the older imbalanced-learn
# API; newer releases may need SVMSMOTE / fit_resample -- confirm against the
# installed version.
sm = SMOTE(kind='svm')
x, y = sm.fit_sample(x1, y1)

# The hold-out split is only used to report its shape; every model below is
# trained on the full resampled data.
pred_train, pred_test, tar_train, tar_test = train_test_split(x, y, test_size=0.3)
print('Shape of test data', pred_test.shape)

# The base estimators are handed over unfitted: EnsembleVoteClassifier.fit
# (default settings) fits them itself, so fitting them here first would only
# repeat the training work.
classifier = tree.DecisionTreeClassifier(criterion="entropy")
classifier2 = svm.SVC()
# NOTE(review): n_jobs controls parallelism, not the number of trees; this
# may have been meant as n_estimators=250 -- left unchanged, confirm intent.
classifier3 = RandomForestClassifier(n_jobs=250)
clfs = [classifier, classifier2, classifier3]
clf = EnsembleVoteClassifier(clfs, voting='hard', weights=(4, 4, 5))
clf.fit(x, y)

# df2 keeps the Id column for the output file; df is the model input.
df2 = pd.read_csv(TEST_FILE, header=0, sep=",")
df = df2.drop([ID_COLUMN], axis=1).fillna(MISSING_VALUE)
x = df[features]
predictions = clf.predict(x)

print('predictions')
for prediction in predictions:
    print(prediction)

df2[TARGET_COLUMN] = predictions
print('count', df2[TARGET_COLUMN])
header = [ID_COLUMN, TARGET_COLUMN]
df2.to_csv(OUTPUT_FILE, sep=',', columns=header, index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Binary classification script.
#
# Fits an XGBoost classifier on the labelled training table, predicts a class
# for every row of the test table and writes an "Id,class" CSV.
import sys

import pandas as pd
import sklearn.metrics
from sklearn import tree

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the previous module path.
    from sklearn.cross_validation import train_test_split

# Hard-coded Windows path added to the module search path before xgboost is
# imported.
sys.path.append('C:\\Anaconda2\\xgboost')
import xgboost as xgb

TRAIN_FILE = "TrainDataBinaryClassification.xls"
TEST_FILE = "TestDataTwoClassResults.xls"
OUTPUT_FILE = "Results_Adulteration_Sep15.csv"
ID_COLUMN = 'Id'
TARGET_COLUMN = 'class'
# Sentinel written into every missing cell of both the train and test tables.
MISSING_VALUE = -99

# 2.7.12 |Anaconda 4.0.0 (64-bit)|
print(sys.version)

# NOTE(review): the input files carry an .xls extension but are parsed as
# comma-separated text -- confirm they really are CSV.
df = pd.read_csv(TRAIN_FILE, header=0, sep=",")
print(df.head())

# Remove the insignificant id column. `axis` is passed by keyword because
# newer pandas releases no longer accept it positionally.
df.drop([ID_COLUMN], axis=1, inplace=True)

# List all column headers
print(list(df))
print(df.head())

# Fill missing values
df = df.fillna(MISSING_VALUE)

# Every column except the last one is treated as a feature.
# NOTE(review): presumably the last column is the 'class' label -- verify.
features = list(df.columns[:-1])
print(features)
y = df[TARGET_COLUMN]
x = df[features]

# The hold-out split is only used to report its shape; the model below is
# trained on the full training table.
pred_train, pred_test, tar_train, tar_test = train_test_split(x, y, test_size=0.3)
print('Shape of test data', pred_test.shape)

gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(x, y)

# df2 keeps the Id column for the output file; df is the model input.
df2 = pd.read_csv(TEST_FILE, header=0, sep=",")
df = df2.drop([ID_COLUMN], axis=1).fillna(MISSING_VALUE)
x = df[features]
predictions = gbm.predict(x)

print('predictions')
for prediction in predictions:
    print(prediction)

df2[TARGET_COLUMN] = predictions
header = [ID_COLUMN, TARGET_COLUMN]
df2.to_csv(OUTPUT_FILE, sep=',', columns=header, index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Multi-class classification script.
#
# Reports the hold-out accuracy of a decision tree as a quick sanity check,
# then fits a random forest on the full training table, predicts a class for
# every row of the test table and writes an "Id,class" CSV.
import sys

import pandas as pd
import sklearn.metrics
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the previous module path.
    from sklearn.cross_validation import train_test_split

TRAIN_FILE = "TrainDataMultiClassClassification_Custom_sep18.csv"
TEST_FILE = "TestDataMultiClass.xls"
OUTPUT_FILE = "Results_Multi_Class_Adulteration_2_Sep18_RF.csv"
ID_COLUMN = 'Id'
TARGET_COLUMN = 'class'
# Sentinel written into every missing cell of both the train and test tables.
MISSING_VALUE = -9999

# 2.7.12 |Anaconda 4.0.0 (64-bit)|
print(sys.version)

df = pd.read_csv(TRAIN_FILE, header=0, sep=",")
print(df.head())

# Remove the insignificant id column. `axis` is passed by keyword because
# newer pandas releases no longer accept it positionally.
df.drop([ID_COLUMN], axis=1, inplace=True)

# List all column headers
print(list(df))
print(df.head())

# Fill missing values
df = df.fillna(MISSING_VALUE)

# Every column except the last one is treated as a feature.
# NOTE(review): presumably the last column is the 'class' label -- verify.
features = list(df.columns[:-1])
print(features)
y = df[TARGET_COLUMN]
x = df[features]

pred_train, pred_test, tar_train, tar_test = train_test_split(x, y, test_size=0.3)
print('Shape of test data', pred_test.shape)

# Decision tree trained on the training split and scored on the hold-out
# rows only; scoring on all of (x, y) would include the rows it was fitted on.
classifier = tree.DecisionTreeClassifier(criterion="entropy")
classifier = classifier.fit(pred_train, tar_train)
print('acc', classifier.score(pred_test, tar_test))

# The model used for the submission is trained on every labelled row.
rf = RandomForestClassifier(n_estimators=300)
rf.fit(x, y)

# NOTE(review): the test file carries an .xls extension but is parsed as
# comma-separated text -- confirm it really is CSV.
# df2 keeps the Id column for the output file; df is the model input.
df2 = pd.read_csv(TEST_FILE, header=0, sep=",")
df = df2.drop([ID_COLUMN], axis=1).fillna(MISSING_VALUE)
x = df[features]
predictions = rf.predict(x)

print('predictions')
for prediction in predictions:
    print(prediction)

df2[TARGET_COLUMN] = predictions
print('count', df2[TARGET_COLUMN])
header = [ID_COLUMN, TARGET_COLUMN]
df2.to_csv(OUTPUT_FILE, sep=',', columns=header, index=False)
Happy Learning!!!
No comments:
Post a Comment