This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reference: https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
import pandas as pd
import numpy as np

# Toy input shared by the demo functions in this script. 'COlB' holds an
# empty string on purpose so there is a "missing" cell to detect and replace.
data = {
    'ColA': [10, 20, 30],
    'COlB': [5, '', 10],
    'ColC': ['A', 'B', 'C'],
    'ColD': [1, 2, 3],
    'ColE': [10, 40, 50],
    'BinCol': ['A', 'B', 'A'],
}

# Mind the capitalisation: the constructor is pd.DataFrame (capital D and F).
dataset = pd.DataFrame(data)
print(dataset.head())
def ReplaceMissingValues(df=None):
    """Report missing cells in a frame and return a copy with them set to -999.

    Prints, in order: the column names, the per-column null count, the
    per-column count of empty-string cells, and the frame after empty strings
    have been turned into NaN.

    Args:
        df: Frame to clean. When omitted, the module-level ``dataset`` is used,
            which is what existing no-argument callers rely on.

    Returns:
        A new DataFrame in which every empty-string cell and every NaN cell
        has been replaced by -999. The input frame is not modified.
    """
    if df is None:
        df = dataset

    # Column names
    print(list(df.columns))
    # Cells pandas already treats as null
    print(df.isnull().sum())
    # Empty strings are not null to pandas, so count them separately;
    # axis=0 yields one total per column.
    print((df == '').sum(axis=0))

    # np.nan (lowercase) is the supported spelling; the np.NaN alias was
    # removed in NumPy 2.0.
    datasetstd = df.replace('', np.nan)
    print(datasetstd)

    # A single sentinel now covers both originally-null and originally-empty cells.
    datasetstd = datasetstd.replace(np.nan, -999)
    return datasetstd
from sklearn import preprocessing | |
def HandleCategoryValues():
    """Print three encodings of the categorical columns of ``dataset``.

    First the 'ColC' column is expanded with ``pd.get_dummies`` and joined onto
    the cleaned frame returned by ``ReplaceMissingValues``. Then the 'BinCol'
    column is run through scikit-learn's ``OneHotEncoder`` and
    ``OrdinalEncoder``. Every intermediate result is printed; nothing is
    returned.
    """
    cleaned = ReplaceMissingValues()

    # Indicator columns for 'ColC', built from the raw (uncleaned) frame.
    dummies = pd.get_dummies(dataset['ColC'])
    print(dummies)

    # The original categorical column is dropped before its indicator
    # columns are attached.
    cleaned = cleaned.drop(columns=['ColC'])
    print(cleaned)

    # axis=1 places the frames side by side (column-wise concatenation).
    combined = pd.concat([cleaned, dummies], axis=1)
    print(combined)

    # Double brackets keep a one-column DataFrame rather than a Series.
    binary_col = dataset[['BinCol']]

    # OneHotEncoder turns each categorical feature with n_categories possible
    # values into n_categories binary features.
    print('OneHotEncoder')
    print(binary_col)
    one_hot = preprocessing.OneHotEncoder()
    print(one_hot.fit_transform(binary_col))

    print('ordinal encoder')
    ordinal = preprocessing.OrdinalEncoder()
    print(ordinal.fit_transform(binary_col))
def StandardizeNumericalData():
    """Print 'ColD' and 'ColE' of ``dataset`` after three different scalers.

    The two columns are printed as-is, then fitted and transformed with
    ``MinMaxScaler``, ``StandardScaler`` and ``MaxAbsScaler`` in that order,
    each result preceded by a label line. Nothing is returned.
    """
    numeric = dataset[['ColD', 'ColE']]
    print(numeric)

    # (label printed before the result, scaler instance)
    labelled_scalers = (
        ('min_max_scaler', preprocessing.MinMaxScaler()),
        ('scaler', preprocessing.StandardScaler()),
        ('max_abs_scaler', preprocessing.MaxAbsScaler()),
    )
    for label, scaler in labelled_scalers:
        print(label)
        print(scaler.fit_transform(numeric))
# Script entry: run the categorical-encoding demo. Uncomment the second call
# to run the numeric-scaling demo as well.
HandleCategoryValues()
# StandardizeNumericalData()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import math | |
# Small sample table; two of the 'location' entries are NaN.
data = {
    'name': ['Raj', 'Siva', 'Mike', 'Dan', 'New_Joinee'],
    'age': [22, 38, 26, 35, 22],
    'location': ['Chennai', math.nan, 'Bengaluru', 'Chennai', math.nan],
}
dframe = pd.DataFrame(data)
print(dframe)

# Per-column count of missing cells.
print('Missing Data Stats')
print(dframe.isna().sum())

from sklearn.impute import SimpleImputer

# Option 1 fills gaps with each column's most frequent value;
# option 2 fills them with the constant string 'missing'.
for imputer in (
    SimpleImputer(strategy='most_frequent'),
    SimpleImputer(strategy='constant', fill_value='missing'),
):
    newdata = imputer.fit_transform(dframe)
    print(newdata)
import numpy as np
from sklearn.model_selection import train_test_split

# Ten samples; three of the ten labels are 1.
x = np.arange(10)
y = np.array([0, 1, 0, 0, 1, 0, 0, 0, 1, 0])

# stratify=y asks for a split that keeps the label proportions of y in both
# parts. No random_state is given, so the exact split differs between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
)
print(y_train)
print(y_test)
# Source of the idea: https://twitter.com/justmarkham/status/1244986650410786817/photo/1
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline  # imported but not used in the lines shown here
from sklearn.compose import make_column_transformer

# Same people as before, this time with a 'gender' column instead of 'age'.
data = {
    'name': ['Raj', 'Siva', 'Mike', 'Dan', 'New_Joinee'],
    'gender': ['M', 'M', 'M', 'M', 'F'],
    'location': ['Chennai', math.nan, 'Bengaluru', 'Chennai', math.nan],
}
dframe = pd.DataFrame(data)

# One-hot encode the two categorical columns and pass every other column
# through untouched. The transformer is only built and printed here; it is
# not fitted on dframe.
ohe = OneHotEncoder()
ct = make_column_transformer(
    (ohe, ['gender', 'location']),
    remainder='passthrough',
)
print(ct)
No comments:
Post a Comment