In this introduction we will explore some useful R functions for data preparation, and then take a very quick look at clustering and classification.

In [1]:
#1.  First read the data from a CSV file into an R dataframe 
W_df_orig = read.table('weather_orig.csv',
                      header=TRUE,
                      sep=",",
                      stringsAsFactors = TRUE) 
dim(W_df_orig)
366 24

Quick way to view some rows:

In [2]:
head(W_df_orig)
Date       Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am ⋯ Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RISK_MM RainTomorrow
2007-11-01 Canberra 8.0 24.3 0.0 3.4 6.3 NW 30 SW ⋯ 29 1019.7 1015.0 7 7 14.4 23.6 No 3.6 Yes
2007-11-02 Canberra 14.0 26.9 3.6 4.4 9.7 ENE 39 E ⋯ 36 1012.4 1008.4 5 3 17.5 25.7 Yes 3.6 Yes
2007-11-03 Canberra 13.7 23.4 3.6 5.8 3.3 NW 85 N ⋯ 69 1009.5 1007.2 8 7 15.4 20.2 Yes 39.8 Yes
2007-11-04 Canberra 13.3 15.5 39.8 7.2 9.1 NW 54 WNW ⋯ 56 1005.5 1007.0 2 7 13.5 14.1 Yes 2.8 Yes
2007-11-05 Canberra 7.6 16.1 2.8 5.6 10.6 SSE 50 SSE ⋯ 49 1018.3 1018.5 7 7 11.1 15.4 Yes 0.0 No
2007-11-06 Canberra 6.2 16.9 0.0 5.8 8.2 SE 44 SE ⋯ 57 1023.8 1021.7 7 5 10.9 14.8 No 0.2 No
In [3]:
tail(W_df_orig)     #note: W_df is not created until the subsetting step below, so use W_df_orig here
In [ ]:
str(W_df_orig)      #Quick view of the basic 'structure' of the data frame

Subsetting rows and columns

In [4]:
#1 Keep rows that are NOT missing data
keep_ind  = complete.cases(W_df_orig)
W_df      = W_df_orig[keep_ind,]

Y = as.numeric(W_df[,'RainTomorrow'])  #save this for later (No=1, Yes=2)

#2 subset() with select= is a good way to remove columns
W_df = subset(W_df, select=-c(RISK_MM,RainTomorrow))

dim(W_df)
328 22
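Before dropping rows with complete.cases(), it can help to see which columns are responsible for the missing values. A minimal sketch on a synthetic data frame (toy data, not the weather file):

```r
# Synthetic data frame with scattered NAs (toy example, not the weather data)
toy <- data.frame(a = c(1, NA, 3, 4),
                  b = c("x", "y", NA, "z"),
                  c = c(10, 20, 30, 40))

colSums(is.na(toy))           # NA count per column: a=1, b=1, c=0
toy_complete <- toy[complete.cases(toy), ]
nrow(toy_complete)            # 2 complete rows remain
```

If one column carries most of the NAs, dropping that column first can preserve many more rows than row-wise deletion alone.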
In [5]:
# use as.Date( ) to convert strings to dates 
W_df_date <- as.Date(W_df[,'Date'])
W_df_date[366]-W_df_date[1]    #now you can subtract dates
Time difference of NA days

The result is NA because W_df kept only 328 of the original 366 rows after complete.cases(), so W_df_date[366] does not exist. Use W_df_date[length(W_df_date)] to get the last date instead.
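Date arithmetic in base R works once the strings are converted. A self-contained sketch (the dates below are made up for illustration):

```r
d <- as.Date(c("2007-11-01", "2008-10-31"))   # example dates, not from the data

d[2] - d[1]                      # a difftime: 365 days (the span includes 2008-02-29)
as.numeric(d[2] - d[1])          # 365, as a plain number
seq(d[1], by = "month", length.out = 3)   # a monthly sequence of dates
format(d[1], "%Y-%m")            # "2007-11": pull parts out of a date
```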

Data preparation snippet: reshaping a dataframe with 'dcast' command

Now, imagine that for each day we want to list a measurement for each wind direction, all in the same row. You might think of it as setting up an ANOVA where each factor level is its own variable.

Run this section and notice what the new rows look like. Where are the new columns?

In [6]:
library(reshape2)

# long to wide: 'cast' the repeated measure into a wide table
W_wide = dcast(W_df,  
               formula = Date + Location + ... ~ WindGustDir,   
                      # the ' ... '  means take all the rest of the columns as the index
                      # WindGustDir entries are labels for the repeated measures
               fill = 0,     #fill in empty cells; this could be 0 or NA, for example
               value.var = "WindGustSpeed")   
                      #this variable holds the repeated measurement values

head(W_wide)
#optional: write.csv(W_wide, file='Weather_castwide.csv')
Date       Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindDir9am WindDir3pm WindSpeed9am ⋯ NNW NW S SE SSE SSW SW W WNW WSW
2007-11-01 Canberra 8.0 24.3 0.0 3.4 6.3 SW NW 6 ⋯ 0 30 0 0 0 0 0 0 0 0
2007-11-02 Canberra 14.0 26.9 3.6 4.4 9.7 E W 4 ⋯ 0 0 0 0 0 0 0 0 0 0
2007-11-03 Canberra 13.7 23.4 3.6 5.8 3.3 N NNE 6 ⋯ 0 85 0 0 0 0 0 0 0 0
2007-11-04 Canberra 13.3 15.5 39.8 7.2 9.1 WNW W 30 ⋯ 0 54 0 0 0 0 0 0 0 0
2007-11-05 Canberra 7.6 16.1 2.8 5.6 10.6 SSE ESE 20 ⋯ 0 0 0 0 50 0 0 0 0 0
2007-11-06 Canberra 6.2 16.9 0.0 5.8 8.2 SE E 20 ⋯ 0 0 0 44 0 0 0 0 0 0
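dcast has an inverse, melt(), which goes from wide back to long. A tiny self-contained sketch of the round trip (synthetic day/direction data, not the weather frame):

```r
library(reshape2)

# Tiny long-format table: one row per (day, direction) measurement (synthetic)
long <- data.frame(day   = c("d1", "d1", "d2"),
                   dir   = c("N", "E", "N"),
                   speed = c(30, 12, 25))

# long -> wide: one row per day, one column per direction
wide <- dcast(long, day ~ dir, value.var = "speed", fill = 0)
wide
#   day  E  N
# 1  d1 12 30
# 2  d2  0 25

# wide -> long again: id.vars stay as-is, the other columns get stacked
long2 <- melt(wide, id.vars = "day", variable.name = "dir", value.name = "speed")
```

Note that the round trip is not exact: the fill value 0 becomes a real measurement row in long2, so combinations that were absent in the original long table now appear with speed 0.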

Get factors using SVD

In [7]:
#1 Get numeric columns only
cols_numeric = sapply(W_df, is.numeric)   #logical vector: which columns are numeric
W_dfnum      = W_df[, which(cols_numeric)]
dim(W_dfnum)
328 16
In [8]:
#2 turn it into a matrix
W_matrix = as.matrix(W_dfnum)
In [9]:
#3 mean center data
W_mncntr=scale(W_dfnum,center=TRUE,scale=FALSE)
In [10]:
#4 run SVD command
Wsvd=svd(W_mncntr)
str(Wsvd)
List of 3
 $ d: num [1:16] 367 307 215 164 109 ...
 $ u: num [1:328, 1:16] -0.03129 -0.01506 0.03569 0.00638 0.00357 ...
 $ v: num [1:16, 1:16] -0.0775 -0.2114 0.0446 -0.0778 -0.128 ...
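The singular values in Wsvd$d tell you how much of the variance each component captures: d squared, divided by its sum, gives the proportion of variance explained. A sketch on random data (synthetic, not the weather matrix):

```r
set.seed(1)
X  <- matrix(rnorm(100 * 5), nrow = 100, ncol = 5)   # synthetic data
Xc <- scale(X, center = TRUE, scale = FALSE)         # mean-center, as above
s  <- svd(Xc)

# proportion of variance captured by each component
var_explained <- s$d^2 / sum(s$d^2)
round(var_explained, 3)          # these proportions sum to 1

# sanity check: U D V' reconstructs the centered matrix
X_rebuilt <- s$u %*% diag(s$d) %*% t(s$v)
max(abs(X_rebuilt - Xc))         # ~0, up to floating-point error
```

On real data like the weather matrix, a steep drop-off in var_explained suggests a few components carry most of the structure, which is why plotting onto the first two is often informative.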

Get k-means clusters and plot them onto the first two SVD factors

In [11]:
#get k-means with 4 clusters, at most 10 iterations, and 1 starting point
k4             = kmeans(W_mncntr,4,10,1)

#set color scheme
col2use        = c('red','blue','black','yellow')
#set cluster assignment in colors
colassignments = col2use[k4$cluster]

W_proj = as.matrix(W_mncntr) %*% Wsvd$v[,1:3]    #project data onto 3 components

plot(W_proj[,1],W_proj[,2],col=colassignments,main='data projected onto SVD components 1,2, colored by kmeans')

# to plot center points, first project them into components
c3 = k4$centers%*% Wsvd$v[,1:3]
points(c3[,1],c3[,2],pch=8,cex=2)
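The choice of 4 clusters above was arbitrary. One common heuristic is to run kmeans for a range of k, plot the total within-cluster sum of squares, and look for an "elbow" where adding clusters stops helping much. A sketch on synthetic 2-D data:

```r
set.seed(42)
# synthetic data: two well-separated 2-D blobs
X <- rbind(matrix(rnorm(50, mean = 0), ncol = 2),
           matrix(rnorm(50, mean = 5), ncol = 2))

# total within-cluster SS for k = 1..6 (nstart = 5 to avoid bad local optima)
wss <- sapply(1:6, function(k) kmeans(X, centers = k, nstart = 5)$tot.withinss)

plot(1:6, wss, type = "b", xlab = "k (number of clusters)",
     ylab = "total within-cluster SS", main = "elbow plot")
```

For the two-blob data here the curve drops sharply from k = 1 to k = 2 and then flattens, pointing to k = 2.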
In [12]:
#Y was created above, use it to select 2 colors

#get class assignment in colors
colassignments = col2use[Y]

plot(W_proj[,1],W_proj[,2],col=colassignments,main='data projected onto SVD components 1,2, colored by class')

Get simple classification model results

Let's run a linear model and apply a simple decision threshold to turn it into a classification model for 'Rain Tomorrow'. Recall that the variable Y holds the 'Rain Tomorrow' outcomes (No=1, Yes=2).

In [13]:
linmodel_result = lm(Y~.,data=W_dfnum)   #Y is either 1 or 2, so use 1.5 as the cutoff between 'No' and 'Yes' predictions

#Note the 'Y~.'  is R's formula notation for Y=f( everything else)
In [14]:
summary(Y)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.000   1.000   1.183   1.000   2.000 
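With this 1/2 coding, mean(Y) - 1 is the proportion of 'Yes' outcomes, so the mean of 1.183 above says that rain followed on roughly 18% of the kept days. On a toy vector:

```r
y <- c(1, 1, 1, 1, 2)    # toy 1/2-coded outcome, not the real Y
mean(y) - 1              # proportion coded 2: here 0.2
```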
In [15]:
#get model predictions (more generally we would use a test set to get prediction accuracy estimates)
Y_pred=linmodel_result$fitted.values  

#get the indices of predictions NO vs YES
Y_pred1_indices   =which(Y_pred<1.5)
Y_pred2_indices   =which(Y_pred>=1.5)

#set up No,Yes predictions
Y_pred_class                 =matrix(1,length(Y),1)  
Y_pred_class[Y_pred2_indices]=2

#show a confusion matrix 
table(Y,Y_pred_class)
   Y_pred_class
Y     1   2
  1 259   9
  2  31  29
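The confusion matrix above can be summarized into accuracy, precision, and recall. Re-entering the printed counts by hand (actual class in rows, prediction in columns):

```r
# counts copied from the table above: rows = actual Y, cols = predicted class
cm <- matrix(c(259, 31, 9, 29), nrow = 2,
             dimnames = list(Y = c("1", "2"), Y_pred = c("1", "2")))

accuracy      <- sum(diag(cm)) / sum(cm)         # (259 + 29) / 328
recall_yes    <- cm["2", "2"] / sum(cm["2", ])   # of actual 'Yes' days, fraction caught
precision_yes <- cm["2", "2"] / sum(cm[, "2"])   # of predicted 'Yes', fraction correct
round(c(accuracy = accuracy, recall = recall_yes, precision = precision_yes), 3)
```

Accuracy comes out near 0.88, but recall on the rainy class is under 0.5: the model misses about half the rainy days, which the headline accuracy hides because 'No' days dominate.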
In [16]:
#Now color the predictions onto the 2 SVD dimensions

colassignments = col2use[Y_pred_class]

plot(W_proj[,1],W_proj[,2],col=colassignments,main='data projected onto SVD components 1,2, colored by PREDICTED class')