"No one is harder on a talented person than the person themselves" - Linda Wilkinson ; "Trust your guts and don't follow the herd" ; "Validate direction not destination" ;

January 15, 2020

Day #320 - Preprocessing Examples


# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
# Toy frame for the demos below: ColB holds an empty-string "missing" cell,
# ColC/BinCol are categorical, ColD/ColE are numeric.
# NOTE: key fixed from the original typo 'COlB' to 'ColB' for consistency
# with the other column names (the key is never referenced by name later).
data = {'ColA': [10, 20, 30], 'ColB': [5, '', 10], 'ColC': ['A', 'B', 'C'],
        'ColD': [1, 2, 3], 'ColE': [10, 40, 50], 'BinCol': ['A', 'B', 'A']}
import pandas as pd
import numpy as np
dataset = pd.DataFrame(data)
print(dataset.head())
def ReplaceMissingValues():
    """Replace empty-string cells with NaN, then NaN with the sentinel -999.

    Reads the module-level ``dataset`` DataFrame and returns a cleaned
    copy; ``dataset`` itself is not modified.
    """
    colnames = list(dataset.columns)
    # Column names
    print(colnames)
    # Null stats for every column
    print(dataset.isnull().sum())
    # Count cells holding an empty string.
    # Usually axis=0 is said to be "column-wise" (and axis=1 "row-wise").
    print((dataset == '').sum(axis=0))
    # np.nan: the np.NaN alias used originally was removed in NumPy 2.0
    datasetstd = dataset.replace('', np.nan)
    print(datasetstd)
    # -999 acts as an out-of-band "missing" sentinel value
    datasetstd = datasetstd.replace(np.nan, -999)
    return datasetstd
from sklearn import preprocessing
def HandleCategoryValues():
    """Demonstrate three encodings of the categorical columns of ``dataset``:
    pandas dummies, sklearn OneHotEncoder, and sklearn OrdinalEncoder.

    Prints each intermediate result; returns None.
    """
    datasetstd = ReplaceMissingValues()
    # pandas dummy/indicator columns for ColC
    catcol = pd.get_dummies(dataset['ColC'])
    print(catcol)
    # drop the original categorical column from the cleaned frame ...
    datasetstd = datasetstd.drop(['ColC'], axis=1)
    print(datasetstd)
    # ... then merge the dummy columns back in, column-wise (axis=1)
    frames = [datasetstd, catcol]
    result = pd.concat(frames, axis=1)
    print(result)
    rawdata = dataset[['BinCol']]
    # OneHotEncoder transforms each categorical feature with n_categories
    # possible values into n_categories binary features
    ohe = preprocessing.OneHotEncoder()
    print('OneHotEncoder')
    print(rawdata)
    print(ohe.fit_transform(rawdata))
    # OrdinalEncoder maps each category to a single integer instead
    ode = preprocessing.OrdinalEncoder()
    print('ordinal encoder')
    print(ode.fit_transform(rawdata))
def StandardizeNumericalData():
    """Demonstrate three sklearn scalers on the numeric columns of ``dataset``.

    Applies MinMaxScaler, StandardScaler and MaxAbsScaler to ColD/ColE and
    prints each transformed array; returns None.
    """
    rawdata = dataset[['ColD', 'ColE']]
    print(rawdata)
    # MinMaxScaler: rescale each column to the [0, 1] range
    min_max_scaler = preprocessing.MinMaxScaler()
    print('min_max_scaler')
    print(min_max_scaler.fit_transform(rawdata))
    # StandardScaler: zero mean, unit variance per column
    scaler = preprocessing.StandardScaler()
    print('scaler')
    print(scaler.fit_transform(rawdata))
    # MaxAbsScaler: divide by the per-column maximum absolute value
    max_abs_scaler = preprocessing.MaxAbsScaler()
    print('max_abs_scaler')
    print(max_abs_scaler.fit_transform(rawdata))
HandleCategoryValues()
# Disabled demo call; name corrected from the original typo 'StandardizeData'
#StandardizeNumericalData()
import pandas as pd
import math

# Second example frame: the 'location' column carries two NaN entries,
# used by the imputation demos that follow.
data = {
    'name': ['Raj', 'Siva', 'Mike', 'Dan', 'New_Joinee'],
    'age': [22, 38, 26, 35, 22],
    'location': ['Chennai', math.nan, 'Bengaluru', 'Chennai', math.nan],
}
dframe = pd.DataFrame(data)
print(dframe)
print('Missing Data Stats')
print(dframe.isna().sum())
# Option 1 - fill each gap with the column's most frequent value
from sklearn.impute import SimpleImputer

mode_imputer = SimpleImputer(strategy='most_frequent')
newdata = mode_imputer.fit_transform(dframe)
print(newdata)

# Option 2 - fill each gap with a fixed placeholder string
const_imputer = SimpleImputer(strategy='constant', fill_value='missing')
newdata = const_imputer.fit_transform(dframe)
print(newdata)
import numpy as np

# Toy imbalanced labels: 3 positives among 10 samples
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([0, 1, 0, 0, 1, 0, 0, 0, 1, 0])
# stratify=y keeps the positive/negative ratio the same in both halves
from sklearn.model_selection import train_test_split
# random_state pins the shuffle so the printed example is reproducible
# (the original call was nondeterministic across runs)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=42)
print(y_train)
print(y_test)
# https://twitter.com/justmarkham/status/1244986650410786817/photo/1
data = {
    'name': ['Raj', 'Siva', 'Mike', 'Dan', 'New_Joinee'],
    'gender': ['M', 'M', 'M', 'M', 'F'],
    'location': ['Chennai', math.nan, 'Bengaluru', 'Chennai', math.nan],
}
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

dframe = pd.DataFrame(data)
ohe = OneHotEncoder()
# One-hot encode gender/location; pass every other column through untouched
ct = make_column_transformer((ohe, ['gender', 'location']), remainder='passthrough')
print(ct)
Happy Learning!!!

No comments: