What I like in my Approach
Happy Learning!!!
- R example covering:
  - Data normalization
  - Clustering the data
  - Building a regression on top of the clustered data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data source: http://www3.dsi.uminho.pt/pcortez/forestfires/
# Step 1 - Load data and check summary details
# The file path is built explicitly instead of calling setwd(), so running the
# script does not change the session's working directory as a side effect.
data_dir <- "E:/RNotes/RData"
forestfires <- read.csv(file = file.path(data_dir, "forestfires.csv"))
head(forestfires)
summary(forestfires)
# Step 2 - Visual correlation between variables (optional, needs 'psych')
# library(psych)
# pairs.panels(forestfires)

# Step 3 - Normalize columns where min and max values have large differences

# Rescale a numeric vector linearly onto [0, 1] (min-max normalization).
# When every value of `x` is identical the range is zero and the result is
# NaN for all elements; NA values in `x` propagate to the whole result.
min_max_scale <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

# Keep only records with a positive burned area so that log(area) is finite.
forestfires <- subset(forestfires, area > 0)
nrow(forestfires)

# The response is modelled on the log scale.
forestfires$area <- log(forestfires$area)

# Normalize the spatial coordinates first; the interaction terms below are
# built from the already-normalized X and Y and then rescaled themselves.
forestfires$X <- min_max_scale(forestfires$X)
forestfires$Y <- min_max_scale(forestfires$Y)
forestfires$XY <- min_max_scale(forestfires$X * forestfires$Y)
forestfires$XXYY <- min_max_scale(
  forestfires$X * forestfires$X * forestfires$Y * forestfires$Y
)
forestfires$FFMC <- min_max_scale(forestfires$FFMC)

# DMC and DC stay on their raw scale (a log transform was tried and disabled).
# Interaction features; FFMCDMCDC uses the normalized FFMC from above.
forestfires$DMCDC <- forestfires$DMC * forestfires$DC
forestfires$tempwind <- forestfires$temp * forestfires$wind
forestfires$FFMCDMCDC <- forestfires$FFMC * forestfires$DMC * forestfires$DC
# print(unique(forestfires$month))
# print(unique(forestfires$day))
# Step 4 - Replace month and day text with numeric values
# month: jan = 1 ... dec = 12; day: sun = 0 ... sat = 6.
# match() fills the whole column in one pass, so the encoding does not depend
# on every label actually occurring in the data. Labels outside these sets
# become NA.
month_labels <- c(
  "jan", "feb", "mar", "apr", "may", "jun",
  "jul", "aug", "sep", "oct", "nov", "dec"
)
day_labels <- c("sun", "mon", "tue", "wed", "thu", "fri", "sat")

forestfires$month1 <- as.numeric(match(forestfires$month, month_labels))
# match() is 1-based; the day encoding starts at 0 for Sunday.
forestfires$day1 <- as.numeric(match(forestfires$day, day_labels) - 1L)

# Step 5 - Remove original columns
forestfires$day <- NULL
forestfires$month <- NULL
# Step 6 - Run K means
library(NbClust)
# Seed set once here; both NbClust and the kmeans call below draw from it.
set.seed(1)
# Evaluate candidate cluster counts from 2 to 15 using k-means partitions.
# NOTE(review): the whole data frame is passed, so the (log) area response and
# the unscaled columns all take part in the clustering - confirm this is
# intended.
numberofclusters <- NbClust(
  forestfires,
  min.nc = 2, max.nc = 15, method = "kmeans"
)

# Step 7 - Print results
# Row 1 of Best.nc holds the number of clusters proposed by each index;
# tabulating it shows how many indices vote for each cluster count.
table(numberofclusters$Best.nc[1, ])

# Cluster count is fixed at 3 - presumably chosen from the vote table above;
# verify against the NbClust output when the data changes.
grpForest <- kmeans(forestfires, centers = 3, nstart = 10)
grpForest$cluster
grpForest$centers
grpForest$withinss
grpForest$size
# Split the data into one data frame per k-means cluster.
# Logical subsetting replaces growing each data frame row by row with rbind(),
# which copies the accumulated result on every iteration.
clusterresults <- grpForest$cluster

# Sanity check: one cluster label per row of the data.
length(clusterresults)
nrow(forestfires)

# Number of rows assigned to each cluster.
table(clusterresults)

forestfirescluster1 <- forestfires[clusterresults == 1, , drop = FALSE]
forestfirescluster2 <- forestfires[clusterresults == 2, , drop = FALSE]
forestfirescluster3 <- forestfires[clusterresults == 3, , drop = FALSE]

nrow(forestfirescluster1)
nrow(forestfirescluster2)
nrow(forestfirescluster3)
# Fit one linear regression per cluster on the same set of predictors.
# `area` is the log-transformed burned area produced earlier in the script.
# Column names are used with `data =` (rather than `df$col` terms) so the
# coefficient names are readable and the fitted models work with predict().
area_formula <- area ~ DMCDC + tempwind + FFMCDMCDC + XY + XXYY + X + Y +
  FFMC + DMC + DC + ISI + temp + RH + wind + rain +
  factor(month1) + factor(day1)

# Fit `area_formula` on one cluster's data, print the fit and its summary,
# and draw two diagnostics: residuals vs fitted values and a histogram of the
# residuals. Returns the fitted lm object.
report_cluster_model <- function(cluster_data) {
  fit <- lm(area_formula, data = cluster_data, na.action = na.exclude)
  print(fit)
  print(summary(fit))
  # The lm component is named `residuals`; `residual.values` does not exist
  # and would silently plot fitted values against their index instead.
  plot(
    fit$fitted.values, fit$residuals,
    xlab = "Fitted values", ylab = "Residuals"
  )
  hist(fit$residuals, main = "Histogram of residuals", xlab = "Residuals")
  fit
}

model1 <- report_cluster_model(forestfirescluster1)

# Best-subset search on cluster 1. The result is stored under its own name so
# the lm fit in `model1` is not overwritten; a regsubsets object has no
# fitted values or residuals, so it gets its own plot method instead.
library(leaps)
model1_subsets <- regsubsets(
  area_formula,
  data = forestfirescluster1,
  nbest = 8,
  na.action = na.exclude
)
model1_subsets
summary(model1_subsets)
plot(model1_subsets, scale = "adjr2")

model2 <- report_cluster_model(forestfirescluster2)
model3 <- report_cluster_model(forestfirescluster3)
Happy Learning!!!
No comments:
Post a Comment