# -*- coding: utf-8 -*-
"""DecisionTrees.ipynb

Automatically generated by Colaboratory.
"""

# The Gini index takes values in the interval [0, 0.5] (for binary
# classification), whereas entropy takes values in [0, 1].
# The Gini index is faster to compute because it avoids logarithms.
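# A quick numeric check of the two ranges above (a minimal sketch, not part of
# the original notebook): for a binary node with positive-class probability p,
# Gini = 1 - p^2 - (1-p)^2 peaks at 0.5 and entropy peaks at 1.0, both at p = 0.5.
import math
p = 0.5
gini = 1 - p**2 - (1 - p)**2                                # -> 0.5
entropy = -p*math.log(p, 2) - (1 - p)*math.log(1 - p, 2)    # -> 1.0
print('Gini at p=0.5:', gini, '| Entropy at p=0.5:', entropy)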

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

dataset = pd.read_csv(r'/content/diabetes_dataset.csv', sep=',', header=0)
print(dataset.head())

# List all column names
print(dataset.columns.tolist())
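
# Optional sanity checks before modelling (a sketch, not in the original
# notebook): confirm the row/column counts, the dtypes, and that no column
# has missing values.
print(dataset.shape)
print(dataset.dtypes)
print(dataset.isnull().sum())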

# Drop the Outcome column and assign the remaining features to x
x = dataset.drop(['Outcome'], axis=1)
# Assign the Outcome column to y
y = dataset.Outcome

# Divide into an 80/20 train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
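
# Note: diabetes labels are usually imbalanced, so a stratified split keeps the
# class ratio the same in train and test. A minimal alternative to the call
# above (stratify is a standard train_test_split parameter):
# x_train, x_test, y_train, y_test = train_test_split(
#     x, y, test_size=0.2, random_state=1, stratify=y)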

# Create a Decision Tree classifier (Gini criterion)
model = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=10, min_samples_split=2)
# Fit the model on the training data
model = model.fit(x_train, y_train)
# Predict the response for the test dataset
y_pred = model.predict(x_test)

# Evaluation using the accuracy score (as a percentage)
print("Accuracy:", accuracy_score(y_test, y_pred)*100)
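
# The confusion_matrix and classification_report imports above can be put to
# use here: per-class precision and recall are more informative than a single
# accuracy number on an imbalanced dataset.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))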

#from sklearn import tree
#text_representation = tree.export_text(model)
#print(text_representation)

# Create a Decision Tree classifier (entropy criterion, random splitter)
model = DecisionTreeClassifier(criterion='entropy', splitter='random', max_depth=10, min_samples_split=2)
# Fit the model on the training data
model = model.fit(x_train, y_train)
# Predict the response for the test dataset
y_pred = model.predict(x_test)

# Evaluation using the accuracy score (as a percentage)
print("Accuracy:", accuracy_score(y_test, y_pred)*100)
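
# Because splitter='random' chooses candidate splits at random, the accuracy
# above varies from run to run. Fixing the classifier's random_state (a
# standard scikit-learn parameter) makes the result reproducible, e.g.:
# model = DecisionTreeClassifier(criterion='entropy', splitter='random',
#                                max_depth=10, min_samples_split=2,
#                                random_state=1)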

# For this worked information-gain example, restrict the Pregnancies feature
# to the values 0, 1 and 2 only. Counts (verified in the sketch below):
# Total samples = 349; Outcome = 0: 263, Outcome = 1: 86
# Pregnancies = 0: Outcome = 0: 73,  Outcome = 1: 38, total samples 111
# Pregnancies = 1: Outcome = 0: 106, Outcome = 1: 29, total samples 135
# Pregnancies = 2: Outcome = 0: 84,  Outcome = 1: 19, total samples 103
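
# The counts above can be reproduced straight from the dataframe (a sketch;
# it assumes the feature column is named 'Pregnancies', as in the standard
# Pima diabetes dataset):
# subset = dataset[dataset['Pregnancies'].isin([0, 1, 2])]
# print(subset.groupby(['Pregnancies', 'Outcome']).size())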

# Entropy of the full dataset (the parent node), computed over the Outcome label
import math
A = ((263/349)*math.log((263/349), 2))
B = ((86/349)*math.log((86/349), 2))
TotalEntropy = -A - B
print('Entropy of the full dataset')
print(TotalEntropy)

# For Pregnancies = 0: [73, 38], 111 samples
A = ((73/111)*math.log((73/111), 2))
B = ((38/111)*math.log((38/111), 2))
EntropyP0 = -A - B
print('Entropy of Pregnancies = 0')
print(EntropyP0)

# For Pregnancies = 1: [106, 29], 135 samples
A = ((106/135)*math.log((106/135), 2))
B = ((29/135)*math.log((29/135), 2))
EntropyP1 = -A - B
print('Entropy of Pregnancies = 1')
print(EntropyP1)

# For Pregnancies = 2: [84, 19], 103 samples
A = ((84/103)*math.log((84/103), 2))
B = ((19/103)*math.log((19/103), 2))
EntropyP2 = -A - B
print('Entropy of Pregnancies = 2')
print(EntropyP2)

# Information gain = parent entropy minus the weighted entropies of the children
Gain_Pfeature = TotalEntropy - (111/349)*EntropyP0 - (135/349)*EntropyP1 - (103/349)*EntropyP2
print('Information gain of the Pregnancies feature')
print(Gain_Pfeature)
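
# The same arithmetic wrapped in a small helper, as a sanity check on the hand
# computation above (a sketch; entropy() and the count lists simply restate
# the numbers already used):
def entropy(counts):
    total = sum(counts)
    return -sum((c/total)*math.log(c/total, 2) for c in counts if c > 0)

parent = entropy([263, 86])
children = [(111, [73, 38]), (135, [106, 29]), (103, [84, 19])]
gain = parent - sum((n/349)*entropy(c) for n, c in children)
print('Information gain (helper):', gain)  # should equal Gain_Pfeature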