"No one is harder on a talented person than the person themselves" - Linda Wilkinson ; "Trust your guts and don't follow the herd" ; "Validate direction not destination" ;

February 20, 2020

Day #324 - Plumbing Old 'R' Code

What I like in my Approach
  • R Example with the Forest Fires dataset
  • Data Normalization
  • Cluster Data
  • Build Regression on top of Clustered Data

# Data source: http://www3.dsi.uminho.pt/pcortez/forestfires/
# Step 1 - Read the forest fires CSV and look at its first rows and summary.
# The working directory is switched to the folder that holds the data file.
setwd("E:/RNotes/RData/")
forestfires <- read.csv("forestfires.csv")
head(forestfires)
summary(forestfires)
#Step 2 - Visual Correlation between variables
#library(psych)
#pairs.panels(forestfires)
# Step 3 - Normalize columns where min and max values have large differences
# and derive the interaction features used by the later steps.

# Min-max rescale a numeric vector onto [0, 1].
rescale01 <- function(x) {
  rng <- range(x)
  (x - rng[1]) / (rng[2] - rng[1])
}

# Keep only rows with a positive burned area so that log(area) is finite.
forestfires <- subset(forestfires, area > 0)
nrow(forestfires)

# Alternatives tried and left disabled: log(DC), log(DMC), and min-max
# scaling of area instead of the log transform below.
forestfires$area <- log(forestfires$area)

# Coordinates: rescale X and Y first, then build and rescale their products.
forestfires$X <- rescale01(forestfires$X)
forestfires$Y <- rescale01(forestfires$Y)
forestfires$XY <- rescale01(forestfires$X * forestfires$Y)
forestfires$XXYY <- rescale01(
  forestfires$X * forestfires$X * forestfires$Y * forestfires$Y
)

# FFMC is rescaled before it enters the FFMC * DMC * DC product below.
forestfires$FFMC <- rescale01(forestfires$FFMC)

# Interaction terms (DMC, DC, temp and wind are used unscaled).
forestfires$DMCDC <- forestfires$DMC * forestfires$DC
forestfires$tempwind <- forestfires$temp * forestfires$wind
forestfires$FFMCDMCDC <- forestfires$FFMC * forestfires$DMC * forestfires$DC

# print(unique(forestfires$month))
# print(unique(forestfires$day))

# Peek at the engineered columns. The earlier sample(forestfires) call only
# shuffled the column order, printed every row and discarded the result.
head(forestfires)
# Step 4 - Replace month and day text with numeric codes:
# jan..dec -> 1..12 and sun..sat -> 0..6. Labels outside these sets become NA.
forestfires$month1 <- as.numeric(match(
  forestfires$month,
  c("jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec")
))
forestfires$day1 <- as.numeric(match(
  forestfires$day,
  c("sun", "mon", "tue", "wed", "thu", "fri", "sat")
)) - 1
# Step 5 - Remove original columns now that the numeric codes exist
forestfires[c("day", "month")] <- NULL
# Step 6 - Run K means
# NbClust evaluates candidate cluster counts from 2 to 15 using k-means; the
# seed is fixed first so this call and the kmeans() call below are repeatable.
library(NbClust)
set.seed(1)
numberofclusters <- NbClust(
  data = forestfires,
  min.nc = 2,
  max.nc = 15,
  method = "kmeans"
)
# Step 7 - Print results
# Tally how many indices voted for each number of clusters.
table(numberofclusters$Best.n[1, ])
# Cluster into three groups, keeping the best of 10 random starts.
grpForest <- kmeans(x = forestfires, centers = 3, nstart = 10)
# Show assignments, centers, within-cluster sums of squares and sizes in turn.
invisible(lapply(
  c("cluster", "centers", "withinss", "size"),
  function(part) print(grpForest[[part]])
))
# Split the data into one data frame per k-means cluster.
# Logical subsetting replaces the former row-by-row rbind() loop, which copied
# the growing data frames on every iteration and printed one line per row.
# Original row names are kept, as they were by the loop.
clusterresults <- grpForest$cluster

# The assignment vector must line up with the rows it is used to index.
length(clusterresults)
nrow(forestfires)
stopifnot(length(clusterresults) == nrow(forestfires))

forestfirescluster1 <- forestfires[clusterresults == 1, , drop = FALSE]
forestfirescluster2 <- forestfires[clusterresults == 2, , drop = FALSE]
forestfirescluster3 <- forestfires[clusterresults == 3, , drop = FALSE]

# Optional and currently disabled: drop incomplete rows before modelling, e.g.
# forestfirescluster1 <- forestfirescluster1[complete.cases(forestfirescluster1), ]
# forestfirescluster3 <- forestfirescluster3[complete.cases(forestfirescluster3), ]

# Rows per cluster.
nrow(forestfirescluster1)
nrow(forestfirescluster2)
nrow(forestfirescluster3)
# Fit one linear regression per cluster and inspect its residuals.

# Shared model: the (log-transformed) area against the engineered interaction
# terms, the raw predictors, and month/day codes treated as factors. Bare
# column names are used so the same formula works with any cluster's data
# frame through `data =`.
area_formula <- area ~ DMCDC + tempwind + FFMCDMCDC + XY + XXYY + X + Y +
  FFMC + DMC + DC + ISI + temp + RH + wind + rain +
  factor(month1) + factor(day1)

# Fit `area_formula` on one cluster, print the fit and its summary, and draw
# the residual diagnostics.
#   cluster_data: data frame holding the rows of one cluster.
#   label:        text used in the plot titles.
# Returns the fitted lm object.
fit_cluster_model <- function(cluster_data, label) {
  fit <- lm(area_formula, data = cluster_data, na.action = na.exclude)
  print(fit)
  print(summary(fit))
  # lm objects have no `residual.values` component, so the former
  # plot(fit$fitted.values, fit$residual.values) received NULL for y and drew
  # fitted values against their index. fitted()/residuals() return vectors of
  # matching length, including under na.exclude.
  plot(
    fitted(fit), residuals(fit),
    xlab = "Fitted values", ylab = "Residuals",
    main = paste(label, "- residuals vs fitted")
  )
  hist(
    residuals(fit),
    xlab = "Residuals",
    main = paste(label, "- residuals")
  )
  fit
}

model1 <- fit_cluster_model(forestfirescluster1, "Cluster 1")

# Best-subset search on cluster 1. The result is stored separately so it no
# longer overwrites the lm fit in model1; a regsubsets object has no
# fitted.values or residuals, so it is plotted with its own plot method.
library(leaps)
subsets1 <- regsubsets(
  area_formula,
  data = forestfirescluster1,
  nbest = 8,
  na.action = na.exclude
)
subsets1
summary(subsets1)
plot(subsets1, scale = "adjr2")

model2 <- fit_cluster_model(forestfirescluster2, "Cluster 2")
model3 <- fit_cluster_model(forestfirescluster3, "Cluster 3")
view raw ForestFires.R hosted with ❤ by GitHub

Happy Learning!!!

No comments: