A few useful R Code Snippets
For those moments when you know you've done something before but can't quickly find it, here is some quick R code that has served me well.
Couple of Basics
# Descriptive statistics for every column of a data set via psych::describe().
# `datasetname` is a placeholder: substitute your own data frame.
library("psych")
describe(datasetname)
Chart layout settings that I mix up:
# Stack subsequent plots in a grid of 3 rows by 1 column
par(mfrow = c(3, 1))
Check for NA Values
# Total number of NA cells across all columns of df_tnt.
# vapply() pins the per-column result to a single integer, so the sum is
# well-defined (0) even for a data frame with no columns.
sum(vapply(df_tnt, function(x) sum(is.na(x)), integer(1)))
# Named logical vector: TRUE for each column that contains at least one NA.
vapply(df_tnt, anyNA, logical(1))
Correlogram that Rocks
library(corrplot)
# Keep only columns whose class is exactly "numeric" (doubles). Integer,
# factor and character columns are excluded; swap the predicate for
# is.numeric if integer columns should be included as well.
numeric_cols <- vapply(
  cc_train,
  function(col) identical(class(col), "numeric"),
  logical(1)
)
# Pairwise correlation matrix of the selected columns.
corrplotdata <- cor(cc_train[numeric_cols])
# Upper-triangle correlogram, variables ordered by hierarchical clustering,
# black labels rotated 60 degrees.
corrplot(
  corrplotdata,
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 60
)
# with significance
#
# Matrix of p-values from pairwise correlation tests between the columns of
# `mat`.
#
# mat: a matrix, or anything as.matrix() can convert, whose columns are the
#      variables and whose rows are the observations.
# ...: further arguments forwarded unchanged to cor.test()
#      (e.g. method = "spearman").
#
# Returns a symmetric n x n matrix (n = number of columns) holding the
# cor.test() p-value for every pair of columns, with 0 on the diagonal and
# the column names of `mat` as row and column names.
#
# NOTE: the corrplot package exports a function with this same name; defining
# this one in the global environment masks it.
cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- matrix(NA_real_, n, n)
  diag(p.mat) <- 0
  # seq_len() yields an empty sequence when there are fewer than two columns,
  # where 1:(n - 1) would count down (1, 0) and index past the matrix.
  for (i in seq_len(max(n - 1L, 0L))) {
    # i is at most n - 1 here, so (i + 1):n is always an increasing sequence.
    for (j in (i + 1L):n) {
      tmp <- cor.test(mat[, i], mat[, j], ...)
      # Fill both triangles so the result is symmetric.
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}
# The p-values have to be computed from the raw observations, not from the
# correlation matrix itself, so test exactly the columns of cc_train that
# corrplotdata was built from.
p.mat <- cor.mtest(cc_train[colnames(corrplotdata)])
head(p.mat)
# Widen the top margin so the title fits; keep the previous settings so they
# can be restored after plotting.
old_par <- par(mar = c(0, 0, 6, 1) + 0.1) # bottom, left, top, right
# par(oma = c(0, 0, 4, 0))
# Correlations whose p-value exceeds sig.level (0.01) are treated as
# insignificant and left blank. "\n" in the title is a line break.
corrplot(
  corrplotdata,
  type = "lower",
  p.mat = p.mat,
  order = "hclust",
  sig.level = 0.01,
  insig = "blank",
  title = " \n Significance of Variables",
  tl.col = "black",
  number.font = 8,
  tl.srt = 20,
  method = "pie"
)
par(old_par)
MICE
# Run MICE to replace missing (NA) values.
# pmm = predictive mean matching, used here for numeric data; lda is used for
# the categorical variable.
# Refer to pg 54 of the MICE documentation for the options "logreg",
# "polyreg" and "polr" for binary/categorical models.
library(VIM)
library(mice)

# Inspect the missing-data pattern, as a table and as a plot.
md.pattern(mb_df)
aggr_plot <- aggr(
  mb_df,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(mb_df),
  cex.axis = .7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

mb_df$STARS <- factor(mb_df$STARS)

# Dry run: maxit = 0 builds the default method vector and predictor matrix
# without spending time on imputation iterations that would be thrown away.
init <- mice(mb_df, maxit = 0)
meth <- init$method
predM <- init$predictorMatrix

# Imputation method per variable.
pmm_vars <- c(
  "pH", "ResidualSugar", "Chlorides", "FreeSulfurDioxide",
  "Alcohol", "TotalSulfurDioxide", "Sulphates"
)
meth[pmm_vars] <- "pmm"
meth["STARS"] <- "lda"

# Fixed seed so the imputed values are reproducible.
set.seed(103)
tempData <- mice(mb_df, method = meth, predictorMatrix = predM, m = 5)
summary(tempData)

# Use the first of the m = 5 completed data sets. complete() is qualified
# because tidyr exports a function with the same name.
mb_df2 <- as.data.frame(mice::complete(tempData, 1))
psych::describe(mb_df2)
Determine if a variable needs transforming
library("e1071")
library("caret")
library("MASS")
library("lattice")
# --- Evaluate whether to transform a variable
# Compare the variable's distribution against a normal distribution, first
# with a density histogram plus fitted normal curve, then with a Q-Q plot.
cc_var_view <- credit_card_default$PAY_AMT1_trans
# cc_var_view <- cc_train$cr_util_bill_amnt1_trans
hist(
  cc_var_view,
  breaks = 10,
  freq = FALSE,
  main = "Histogram of PAY_AMT1_trans variable",
  xlab = "Coefficient Estimation"
)
# na.rm = TRUE keeps the reference curve defined when the variable has NAs
# (hist() already ignores them).
curve(
  dnorm(
    x,
    mean = mean(cc_var_view, na.rm = TRUE),
    sd = sd(cc_var_view, na.rm = TRUE)
  ),
  col = "red",
  lwd = 3,
  add = TRUE
)
qqnorm(cc_var_view, main = "Q-Q Plot of PAY_AMT1_trans")
qqline(cc_var_view, col = "red")
# --- end of variable evaluation
# --- transform chosen variables ----
# Each transformed column is printed after it is created for a visual check.
cc_train$age_trans <- cc_train$AGE^-0.5
cc_train$age_trans
cc_train$limit_bal_trans <- cc_train$LIMIT_BAL^0.3
cc_train$limit_bal_trans
# log1p(x) computes log(1 + x).
cc_train$BILL_AMT1_trans <- log1p(cc_train$BILL_AMT1)
# The log is -Inf at BILL_AMT1 == -1 and NaN below -1; set those to zero.
# is.nan() is FALSE for NA, so values that were missing in BILL_AMT1 stay NA.
cc_train$BILL_AMT1_trans[
  is.infinite(cc_train$BILL_AMT1_trans) | is.nan(cc_train$BILL_AMT1_trans)
] <- 0
cc_train$BILL_AMT1_trans
Leveling different factors between data sets
# level all factors
# Get the union of levels between train and test for PAY_2 and PAY_1.
# as.factor() returns a factor unchanged and converts anything else, so the
# level lookup also works when a column has not been made a factor yet
# (levels() on a non-factor is NULL, which would turn every value into NA
# in the factor() calls below).
PAY_2_levels <- union(
  levels(as.factor(cc_test$PAY_2)),
  levels(as.factor(cc_train$PAY_2))
)
PAY_1_levels <- union(
  levels(as.factor(cc_test$PAY_1)),
  levels(as.factor(cc_train$PAY_1))
)
# rebuild PAY_2 with the union of levels
cc_test$PAY_2 <- factor(cc_test$PAY_2, levels = PAY_2_levels)
cc_train$PAY_2 <- factor(cc_train$PAY_2, levels = PAY_2_levels)
# rebuild PAY_1 with the union of levels
cc_test$PAY_1 <- factor(cc_test$PAY_1, levels = PAY_1_levels)
cc_train$PAY_1 <- factor(cc_train$PAY_1, levels = PAY_1_levels)
#finished equalizing levels on selected variables
Recode Factors
# Recode values of PAY_1: in each "old=new" pair the left value becomes the
# right one, i.e. 8 is updated to 10. The recode-string syntax is the one
# used by car::recode(); the call is qualified because dplyr exports a
# recode() with a different interface.
# NOTE(review): the original assignment target was garbled; this writes the
# result back to cc_train$PAY_1 -- confirm that is the intended column.
cc_train$PAY_1 <- car::recode(cc_train$PAY_1, "-2=0; 5=7; 6=8; 7=9; 8=10")
Evaluate Results using ROC, AUROC, etc
# Accuracy: share of predicted classes equal to the actual classes.
mean(cc_lr_pred_m5_def_f == actual_data)
caret::confusionMatrix(cc_lr_pred_m5_def_f, actual_data, positive = "1")
# ROC curve, area under it, concordance and Kolmogorov-Smirnov diagnostics.
InformationValue::plotROC(actual_data, Predicted_values)
InformationValue::AUROC(actual_data, Predicted_values)
InformationValue::Concordance(actual_data, Predicted_values)$Concordance
InformationValue::ks_plot(actual_data, Predicted_values)
InformationValue::ks_stat(actual_data, Predicted_values) # , returnKSTable = TRUE)
# Lift on TEST data
# as.numeric() on a factor returns the internal level codes (1, 2, ...), not
# the labels, so convert factors through as.character() to recover 0/1.
actual_datab <- if (is.factor(actual_data)) {
  as.numeric(as.character(actual_data))
} else {
  as.numeric(actual_data)
}
gains.cross_test <- gains::gains(actual_datab, Predicted_values, groups = 20)
gains.cross_test
plot(gains.cross_test, main = "Cumulative Lift Chart – Test")
# AIC(Predicted_values)
logLik(cc_lr_model5)
Replace Missing Values based on another Variable in a Separate Column
# Fill missing precipitation values using a related column of the same rows.
ind3 <- is.na(df_tnt$precipitation_amt_mm)
# Row positions with a missing value, in the target and in the source column.
which(is.na(df_tnt$precipitation_amt_mm))
which(is.na(df_tnt$reanalysis_min_air_temp_k))
# The replacement is derived with a formula, here Kelvin to Celsius.
df_tnt$precipitation_amt_mm[ind3] <-
  df_tnt$reanalysis_min_air_temp_k[ind3] - 273.15
# Count remaining NAs (FALSE = present, TRUE = missing).
table(is.na(df_tnt$precipitation_amt_mm))
table(is.na(df_tnt$reanalysis_precip_amt_kg_per_m2))
Filter rows by a column value, e.g. two cities
# One data frame per city: keep only the rows whose city is "sj" or "iq".
df_train_sj_all <- df_tnt %>% dplyr::filter(city == "sj")
df_train_iq_all <- df_tnt %>% dplyr::filter(city == "iq")