# 1. Read the data from a CSV file into an R data frame.
W_df_orig <- read.table(
  "weather_orig.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE  # character columns are read in as factors
)
dim(W_df_orig)
head(W_df_orig)
# W_df is only created in the cleaning step below, so the raw frame is what
# gets inspected at this point.
tail(W_df_orig)
str(W_df_orig)  # quick view of the basic 'structure' of the data frame

# 2. Keep only the rows that are NOT missing any data.
keep_ind <- complete.cases(W_df_orig)
W_df <- W_df_orig[keep_ind, ]

# Save the outcome for later. RainTomorrow is presumably read in as a factor
# (stringsAsFactors = TRUE), in which case as.numeric() yields its integer
# level codes rather than the labels.
Y <- as.numeric(W_df[, "RainTomorrow"])

# 3. subset() with a negated `select` is a good way to remove columns.
W_df <- subset(W_df, select = -c(RISK_MM, RainTomorrow))
dim(W_df)

# 4. Use as.Date() to convert the date strings to Date objects.
W_df_date <- as.Date(W_df[, "Date"])
# Dates can now be subtracted. Index the last element by length instead of a
# fixed row number: rows were dropped above, so a hard-coded position may no
# longer exist and would give NA.
W_df_date[length(W_df_date)] - W_df_date[1]
# Now, imagine that for each day we want to list a measurement for each wind direction, all in the same row. You might think of it as setting up an ANOVA where each factor level is its own variable.
# Run this section and notice what each new row looks like. Where are the new columns?
library(reshape2)

# Long to wide: 'cast' the repeated measure into a wide table.
#   * Left of '~'  : the columns that identify a row. The '...' means "take
#                    all the rest of the columns" as part of that index.
#   * Right of '~' : WindGustDir, whose entries become the labels (new
#                    columns) for the repeated measures.
#   * fill         : value for empty cells; this could be 0 or NA, for example.
#   * value.var    : the variable holding the repeated measurement values.
W_long <- dcast(
  W_df,
  formula = Date + Location + ... ~ WindGustDir,
  fill = 0,
  value.var = "WindGustSpeed"
)
head(W_long)
# optional: write.csv(W_long, file = 'Weather_castwide.csv')
# 1. Get the numeric columns only.
# vapply() always returns a logical vector (one entry per column), even for a
# frame with no columns, whereas sapply() would return an empty list there.
cols_numeric <- vapply(W_df, is.numeric, logical(1))
# drop = FALSE keeps a data frame even when only one column is numeric.
W_dfnum <- W_df[, cols_numeric, drop = FALSE]
dim(W_dfnum)

# 2. Turn it into a matrix.
W_matrix <- as.matrix(W_dfnum)

# 3. Mean-center the data: subtract each column's mean, leave the scale alone.
W_mncntr <- scale(W_matrix, center = TRUE, scale = FALSE)

# 4. Run the SVD; the result is a list with components d, u and v.
Wsvd <- svd(W_mncntr)
str(Wsvd)
# k-means on the centered data: 4 clusters, at most 10 iterations, and a
# single random start. No seed is set here, so the cluster numbering (and
# therefore the colors below) can differ from run to run.
k4 <- kmeans(W_mncntr, centers = 4, iter.max = 10, nstart = 1)

# Color scheme: one color per cluster id.
col2use <- c("red", "blue", "black", "yellow")

# Map every point's cluster id to its color.
colassignments <- col2use[k4$cluster]

# Project the data onto the first 3 right singular vectors.
W_proj <- as.matrix(W_mncntr) %*% Wsvd$v[, 1:3]
plot(
  W_proj[, 1], W_proj[, 2],
  col = colassignments,
  main = "data pts project to 1,2 SVD components, colored by kmeans"
)

# The cluster centers live in the original variable space, so project them
# onto the same components before drawing them on top of the scatter.
c3 <- k4$centers %*% Wsvd$v[, 1:3]
points(c3[, 1], c3[, 2], pch = 8, cex = 2)

# Y was created above; use it to pick one of the first 2 colors per point,
# i.e. color by the observed class instead of by cluster.
colassignments <- col2use[Y]
plot(
  W_proj[, 1], W_proj[, 2],
  col = colassignments,
  main = "data pts project to 1,2 SVD components, colored by class"
)
# Let's fit a linear model and then apply a simple decision threshold to turn it into a classifier for whether it will 'Rain Tomorrow'. Recall that the variable Y holds the 'Rain Tomorrow' outcomes.
# Fit Y on every column of W_dfnum. 'Y ~ .' is R's formula notation for
# Y = f(everything else in `data`).
# Y is either 1 or 2, so 1.5 serves as the cutoff between a 'NO' rain
# prediction and a 'YES'.
linmodel_result <- lm(Y ~ ., data = W_dfnum)
summary(Y)

# Model predictions on the training rows themselves (more generally a test
# set would be used to estimate prediction accuracy).
Y_pred <- linmodel_result$fitted.values

# Row indices of the NO (< 1.5) and YES (>= 1.5) predictions.
Y_pred1_indices <- which(Y_pred < 1.5)
Y_pred2_indices <- which(Y_pred >= 1.5)

# Predicted class: start with everything as 1 (NO), then mark the YES rows 2.
Y_pred_class <- matrix(1, nrow = length(Y), ncol = 1)
Y_pred_class[Y_pred2_indices] <- 2

# Confusion matrix: observed class (rows) against predicted class (columns).
table(Y, Y_pred_class)

# Now color the predictions onto the first 2 SVD dimensions.
colassignments <- col2use[Y_pred_class]
plot(
  W_proj[, 1], W_proj[, 2],
  col = colassignments,
  main = "data pts project to 1,2 SVD components, colored by PREDICTED class"
)