# NOTE: this script was exported from a hosted notebook viewer; the viewer's
# "bidirectional Unicode" banner has been replaced by this comment.
# -*- coding: utf-8 -*-
"""DecisionTrees.ipynb

Automatically generated by Colaboratory.

Decision-tree classification on a diabetes dataset, comparing the Gini
and entropy split criteria, followed by a manual information-gain
calculation for the Pregnancies feature.

Note: Gini impurity lies in [0, 0.5] while entropy lies in [0, 1];
computing the Gini index is faster (no logarithm needed).
"""
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Load the dataset; the first row of the CSV is the header.
dataset = pd.read_csv(r'/content/diabetes_dataset.csv', sep=',', header=0)
print(dataset.head())
# List all column names.
print(dataset.columns.tolist())
# (DecisionTreeClassifier and train_test_split are already imported at the
# top of the file; the duplicate imports were removed.)

# Features: every column except Outcome.
x = dataset.drop(['Outcome'], axis=1)
# Target: the Outcome column.
y = dataset.Outcome
# 80/20 train/test split; a fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Decision tree trained with the Gini impurity criterion.
model = DecisionTreeClassifier(criterion='gini', splitter='best',
                               max_depth=10, min_samples_split=2)
# Fit the model to the training data.
model = model.fit(x_train, y_train)
# Predict the response for the test dataset.
y_pred = model.predict(x_test)
# Evaluation: accuracy as a percentage of correct test predictions.
from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred) * 100)
# Uncomment to dump a text rendering of the fitted tree:
# from sklearn import tree
# print(tree.export_text(model))

# Second model: same data, but splitting on information gain (entropy)
# with random split points, for comparison against the Gini model above.
model = DecisionTreeClassifier(criterion='entropy', splitter='random',
                               max_depth=10, min_samples_split=2)
# Fit the model to the training data.
model = model.fit(x_train, y_train)
# Predict the response for the test dataset.
y_pred = model.predict(x_test)
# Evaluation: accuracy as a percentage of correct test predictions.
from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred) * 100)
# --- Manual information-gain calculation for the Pregnancies feature ---
# For illustration, restrict to Pregnancy values 0, 1, 2 only.
# Counts: 349 total samples (Outcome=0: 263, Outcome=1: 86)
#   Pregnancies=0: 111 samples (Outcome=0: 73,  Outcome=1: 38)
#   Pregnancies=1: 135 samples (Outcome=0: 106, Outcome=1: 29)
#   Pregnancies=2: 103 samples (Outcome=0: 84,  Outcome=1: 19)
import math


def binary_entropy(n_class0, n_total):
    """Return the base-2 Shannon entropy of a two-class node.

    n_class0 -- number of samples in the first class
    n_total  -- total number of samples in the node (both classes,
                each class assumed non-empty so the logs are defined)
    """
    p0 = n_class0 / n_total
    p1 = 1.0 - p0
    return -(p0 * math.log2(p0)) - (p1 * math.log2(p1))


# Entropy of the whole set (parent node).
TotalEntropy = binary_entropy(263, 349)
print('Entropy of Pregnancy column')
print(TotalEntropy)

# Entropy of each child node (one per Pregnancy value).
EntropyP0 = binary_entropy(73, 111)
print('Entropy of Pregnancy 0')
print(EntropyP0)

EntropyP1 = binary_entropy(106, 135)
print('Entropy of Pregnancy 1')
print(EntropyP1)

# BUG FIX: the original negated 84/103 *before* taking the log
# (A = (-84/103)*log2(84/103)), so after `-A - B` the first term had the
# wrong sign and EntropyP2 came out negative. Entropy is always >= 0.
EntropyP2 = binary_entropy(84, 103)
print('Entropy of Pregnancy 2')
print(EntropyP2)

# Information gain = parent entropy minus the weighted child entropies.
Gain_Pfeature = (TotalEntropy
                 - (111 / 349) * EntropyP0
                 - (135 / 349) * EntropyP1
                 - (103 / 349) * EntropyP2)
print('Gain on pregnancy feature')
print(Gain_Pfeature)
# Happy Learning!