This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Text-cleansing demo: build a tm corpus from a few sample messages, run a
# chain of cleaning transformations over it, and print every message before
# and after cleaning.

# Sample messages to clean
data <- c(
  "This AAAA is a CAPITAL is a tweet123455, . ,,,,,,",
  "This 333 spam , ... prize",
  "Congratz Prize Winner22222222222343434",
  "^&^*&*&^*&^*&"
)

# View the first few records
head(data)

# Library for data cleansing
library(tm)

# Convert the character vector to a message corpus (one document per message)
msg_corpus <- Corpus(VectorSource(data))
inspect(msg_corpus[1:4])

# Remove punctuation
refine_corpus <- tm_map(msg_corpus, removePunctuation)

# Convert to lower case (must happen before stop-word removal, because the
# stop-word list is lower case)
refine_corpus <- tm_map(refine_corpus, content_transformer(tolower))

# Remove numbers
refine_corpus <- tm_map(refine_corpus, removeNumbers)

# Remove stop words (default language: English)
refine_corpus <- tm_map(refine_corpus, removeWords, stopwords("en"))

# Collapse whitespace last: the removals above leave gaps where the deleted
# tokens used to be, so stripping earlier would not clean them up.
refine_corpus <- tm_map(refine_corpus, stripWhitespace)

# Print each message before and after cleaning.
# `[[i]]` extracts the i-th document; `[i]` would return a one-document
# corpus, and as.character() on that prints the container, not just the text.
for (i in seq_along(data)) {
  writeLines(as.character(msg_corpus[[i]]))
  writeLines(as.character(refine_corpus[[i]]))
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Small collection of file-handling tips: list the CSV files in a data
# directory, iterate over them, and create empty numeric vectors.

# Set the working directory
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
setwd("E:/Data/")

# Tip #1 - List all CSV files in the directory
# `pattern` is a regular expression, not a shell glob, so match a literal
# ".csv" anchored at the end of the file name.
filenames <- list.files(pattern = "\\.csv$")
print(filenames)

# Tip #2 - Loop through all files
# seq_along() yields an empty sequence when no files were found, so the loop
# body is skipped instead of running on invalid indices.
for (i in seq_along(filenames)) {
  print(filenames[i])
}

# Tip #3 - Initialize empty vectors
a <- numeric(0)
b <- numeric(0)
No comments:
Post a Comment