- Handling blank values
- Handling missing values
- Handling data imbalance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
# Define the sample data frame.
# The row at index 5 holds blank strings (' ' in 'avgage', '' in
# 'Collections') and 'Collections' also holds three NaN entries, so the
# blank-value and missing-value handling further down has data to act on.
# np.nan is used instead of np.NaN: the upper-case alias was removed in
# NumPy 2.0.
Data = {
    'avgage': [22, 38, 26, 35, 22, ' ', 20],
    'Collections': [5000, np.nan, 6000, np.nan, np.nan, '', 4000],
    'Category': [1, 0, 1, 0, 1, 1, 1],
}
Dataset = pd.DataFrame(Data)
# Column names and a first look at the per-column NaN counts
Columnnames = Dataset.columns.tolist()
print(Columnnames)
print(Dataset.isnull().sum())
print('Data Stats')
print('=================')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
Dataset.info()
# Missing value stats: number of NaN entries in each column
print('Null Value Stats')
print('=================')
print(Dataset.isnull().sum(axis=0))
# Blank values along the column: both the empty string '' and the
# single-space string ' ' are counted, since both appear in the data and
# neither is reported by isnull().
print('Blank Values')
print('=================')
print(Dataset.isin(['', ' ']).sum(axis=0))
# Replace blank and missing values with a single sentinel.
# 1. Blank strings ('' and ' ') are masked out, i.e. turned into NaN, so they
#    are handled exactly like the values that were missing from the start.
# 2. Every column is coerced to a numeric dtype.
# 3. All missing entries are filled with the numeric sentinel -999.
# The sentinel is a number rather than the string '-999' so the columns do
# not end up mixing numbers and strings.
DatsetBV = Dataset.mask(Dataset.isin(['', ' ']))
DatsetBV = DatsetBV.apply(pd.to_numeric).fillna(-999)
print(DatsetBV)
# Data imbalance: show how many rows each class has
print('Stats')
class_counts = DatsetBV['Category'].value_counts()
print(class_counts)
# Assign X (features) and Y (target)
y = DatsetBV.Category
x = DatsetBV.drop('Category', axis=1)
# Separate majority and minority class.
# value_counts() sorts by descending frequency, so the first index entry is
# the most frequent label and the last one the least frequent; taking the two
# ends (rather than hard-coding 1 and 0) keeps the split correct if the class
# proportions change, and still yields two different labels on a tie.
majority_label = class_counts.index[0]
minority_label = class_counts.index[-1]
df_majority = DatsetBV[DatsetBV.Category == majority_label]
df_minority = DatsetBV[DatsetBV.Category == minority_label]
from sklearn.utils import resample

# Upsample the minority class: draw with replacement until it has as many
# rows as the majority class.
df_minority_upsampled = resample(
    df_minority,
    replace=True,                # sample with replacement
    n_samples=len(df_majority),  # to match majority class
    random_state=123,            # reproducible results
)
# Downsample the majority class: draw without replacement until it has as
# few rows as the minority class.
df_majority_downsample = resample(
    df_majority,
    replace=False,               # sample without replacement
    n_samples=len(df_minority),  # to match minority class
    random_state=123,            # reproducible results
)
# Option 1: combine the full majority class with the upsampled minority class
df_balanced_class_option1 = pd.concat([df_majority, df_minority_upsampled])
print('upsampled')
print(df_balanced_class_option1)
# Option 2: combine the full minority class with the downsampled majority class
df_balanced_class_option2 = pd.concat([df_minority, df_majority_downsample])
print('downsampled')
print(df_balanced_class_option2)
Happy Learning!!!
No comments:
Post a Comment