"No one is harder on a talented person than the person themselves" - Linda Wilkinson ; "Trust your guts and don't follow the herd" ; "Validate direction not destination" ;

October 15, 2021

Decision Trees - a 5-Minute Tutorial

# -*- coding: utf-8 -*-
"""DecisionTrees.ipynb
Automatically generated by Colaboratory.
"""
#Gini Index has values inside the interval [0, 0.5] whereas the interval of the Entropy is [0, 1].
#Calculation of the Gini Index will be faster.
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Load the diabetes dataset (CSV with a header row).
# NOTE(review): path is hard-coded to Colab's /content directory — this only
# runs inside a Colab session with the file uploaded; confirm before reuse.
dataset = pd.read_csv(r'/content/diabetes_dataset.csv',sep= ',', header =0)
# Quick sanity check: preview the first five rows.
print(dataset.head())
# List all column names (the target column is 'Outcome', used below).
print(dataset.columns.tolist())
# Re-import the estimator and split helper (already imported above; harmless).
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Features: every column except the binary target label.
x = dataset.drop(['Outcome'], axis=1)
# Target: the 'Outcome' column (0 = no diabetes, per the comments below).
y = dataset['Outcome']

# Hold out 20% of the rows for evaluation; fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)
# --- Model 1: decision tree using the Gini impurity criterion ---
model = DecisionTreeClassifier(
    criterion='gini', splitter='best', max_depth=10, min_samples_split=2
)
# Learn the tree from the training partition.
model = model.fit(x_train, y_train)
# Predict labels for the held-out rows.
y_pred = model.predict(x_test)
from sklearn import metrics  # accuracy helpers
# Report test-set accuracy as a percentage.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred) * 100)
# (Optional) dump the fitted tree as indented text:
#from sklearn import tree
#text_representation = tree.export_text(model)
#print(text_representation)
# --- Model 2: decision tree using the entropy (information gain) criterion,
# with randomized split selection for comparison against the Gini model. ---
model = DecisionTreeClassifier(
    criterion='entropy', splitter='random', max_depth=10, min_samples_split=2
)
# Learn the tree from the same training partition.
model = model.fit(x_train, y_train)
# Predict labels for the held-out rows.
y_pred = model.predict(x_test)
from sklearn import metrics  # accuracy helpers
# Report test-set accuracy as a percentage.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred) * 100)
#For Information gain purpose lets limit Pregnancy values = 0,1,2 only
#Total Samples = 349, Outcome = 0 is 263, Outcome = 1 is 86
#Pregnancy values 0, Outcome = 0, 73, Total Samples 111
#Pregnancy values 0, Outcome = 1, 38, Total Samples 111
#Pregnancy values 1, Outcome = 0, 106, Total Samples 135
#Pregnancy values 1, Outcome = 1, 29, Total Samples 135
#Pregnancy values 2, Outcome = 0, 84, Total Samples 103
#Pregnancy values 2, Outcome = 1, 19, Total Samples 103
#Total set
import math
A = ((263/349)*math.log((263/349),2))
B = ((86/349)*math.log((86/349),2))
TotalEntropy = -A-B
print('Entropy of Pregnancy column')
print(TotalEntropy)
#For Pregnancy 0,[73,38], 111
A = ((73/111)*math.log((73/111),2))
B = ((38/111)*math.log((38/111),2))
EntropyP0 = -A-B
print('Entropy of Pregnancy 0')
print(EntropyP0)
#For Pregnancy 1,[106,29],135
A = ((106/135)*math.log((106/135),2))
B = ((29/135)*math.log((29/135),2))
EntropyP1 = -A-B
print('Entropy of Pregnancy 1')
print(EntropyP1)
#For Pregnancy 2,[84,19],103
A = ((-84/103)*math.log((84/103),2))
B = ((19/103)*math.log((19/103),2))
EntropyP2 = -A-B
print('Entropy of Pregnancy 2')
print(EntropyP2)
Gain_Pfeature= TotalEntropy - (111/349)*EntropyP0 - (135/349)*EntropyP1 - (103/349)*EntropyP2
print('Gain on pregnancy feature')
print(Gain_Pfeature)



Happy Learning!!! 

No comments: