"No one is harder on a talented person than the person themselves" - Linda Wilkinson ; "Trust your guts and don't follow the herd" ; "Validate direction not destination" ;

March 03, 2020

Day #330 - Feature Selection Techniques

After a long time, I started reviewing Stanford ML project reports again. The report "Feature Selection for Predictive Models" is the study report for the day.


Code Examples
#Data Distribution Analysis
#Data Cleaning
#Missing Variables
#Feature Correlation
#Feature PCA
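#The first three topics above (distribution analysis, cleaning, missing values) are
#listed but not demonstrated below, so here is a minimal sketch of my own (not from
#the report) on a hypothetical toy frame echoing two Pima column names
import numpy as np
import pandas as pd
toy = pd.DataFrame({'plas':[148.0,85.0,np.nan,89.0],'pres':[72,66,64,66]})
print(toy.describe())      #per-column distribution summary
print(toy.isnull().sum())  #count of missing values per column
toy['plas'] = toy['plas'].fillna(toy['plas'].median())  #simple median imputation
print(toy)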
#Option 1 - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
#remove low variance features
#Intuitively it seems that low variance features are not useful and are just noise to the model.
#The variance of a feature ignores the relationship between the feature and the response
from sklearn import feature_selection as f
x = [[0,2,0,3],[0,1,4,3],[0,1,1,3]]
selector = f.VarianceThreshold(threshold=0.2)
selector.fit(x)
print(selector.variances_)   #per-feature variances: [0., 0.22, 2.89, 0.]
print(selector.transform(x)) #the zero-variance columns 0 and 3 are dropped
#Collinear features are features that are highly correlated with one another.
#Irrelevant or partially relevant features can negatively impact model performance.
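#The chi-squared example below does not address collinearity directly, so here is a
#hedged sketch of my own: spot collinear features with a pandas correlation matrix
#(synthetic data; the 0.9 threshold is an assumption to tune per problem)
import numpy as np
import pandas as pd
rng = np.random.RandomState(0)
a = rng.rand(100)
df = pd.DataFrame({'a':a,'b':a*2+0.01*rng.rand(100),'c':rng.rand(100)})
corr = df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape),k=1).astype(bool))  #upper triangle only
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
print('Collinear features to drop:',to_drop)  #expect ['b'], a near-copy of 'a'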
#https://hub.packtpub.com/4-ways-implement-feature-selection-python-machine-learning/
from sklearn.feature_selection import SelectKBest,chi2
import pandas as pd
#https://github.com/jbrownlee/Datasets
#https://www.andreagrandi.it/2018/04/14/machine-learning-pima-indians-diabetes/
#attribute names (pedi = diabetes pedigree function)
names = ['preg','plas','pres','skin','test','mass','pedi','age','class']
dataset = pd.read_csv(r'E:\Learning_Days_2020\pima-indians-diabetes.csv',header=None)
dataset.columns = names
print(dataset.head())
y = dataset['class']
x = dataset.drop('class',axis=1)
print(x.shape)
print(y.shape)
#K - Number of top features to select.
X_new = SelectKBest(chi2, k=4).fit_transform(x, y)
print(X_new.shape)
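#Usage note (my addition, not from the report): get_support() maps the k=4
#chi2 winners back to the attribute names
kbest = SelectKBest(chi2,k=4).fit(x,y)
print('Selected columns:',list(x.columns[kbest.get_support()]))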
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X_new,y,test_size=0.2,random_state=0)
#fitting Random Forest to training dataset
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=40,criterion='gini',random_state=0)
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
print(cm)
from sklearn import metrics
print('SelectKBest - Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
#Output of the shape prints above: x.shape = (768, 8), X_new.shape = (768, 4)
#RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import warnings
#the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features
warnings.filterwarnings("ignore")
model = LogisticRegression()
rfe = RFE(model,n_features_to_select=3)
fit = rfe.fit(x,y)
print('Number of features %d'%fit.n_features_)
print('Selected Features %s'%fit.support_)
print('Feature Ranking %s'%fit.ranking_)
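#Usage note (my addition): translate the boolean support_ mask into column names
print('Selected columns:',[n for n,s in zip(x.columns,fit.support_) if s])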
from sklearn.model_selection import train_test_split
#keep only the 3 RFE-selected features before training
x_rfe = fit.transform(x)
x_train,x_test,y_train,y_test = train_test_split(x_rfe,y,test_size=0.2,random_state=0)
#fitting Random Forest to training dataset
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=40,criterion='gini',random_state=0)
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
print(cm)
from sklearn import metrics
print('RFE - Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
#PCA
#PCA uses linear algebra to transform the dataset into a compressed form.
#Generally, it is considered a data reduction technique
from sklearn.decomposition import PCA
pca = PCA(0.99)   #keep enough components to explain 99% of the variance
x_new = x.values  #'class' was already dropped above, so keep all 8 predictors
x1 = pca.fit_transform(x_new)
print('Original')
print(x_new[1])
print('After PCA')
print(x1[1])
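#Usage note (my addition): n_components_ shows how many components PCA(0.99)
#kept; PCA is scale-sensitive, so standardizing first (e.g. StandardScaler)
#would change both the components and the downstream accuracy
print('Components kept:',pca.n_components_)
print('Explained variance ratio:',pca.explained_variance_ratio_)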
y1 = y.values
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x1,y1,test_size=0.2,random_state=0)
#fitting Random Forest to training dataset
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=40,criterion='gini',random_state=0)
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
print(cm)
from sklearn import metrics
print('PCA - Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')

Happy Learning!!!
