# A few useful R Code Snippets

For those moments when you know you’ve done something before and yet can’t quickly find it, here is some quick R code that has served me well.

## Couple of Basics

# Descriptive statistics for every column of a data frame.
# `datasetname` is a placeholder: substitute your own data frame.
library("psych")
describe(datasetname)

Chart layout settings that I tend to mix up:
# mfrow takes c(number_of_rows, number_of_columns) for the plot grid.
par(mfrow = c(3, 1))  # 3 rows, 1 column

## Check for NA Values

# Total number of NA cells across the whole data frame.
# vapply() pins the per-column result to a single integer, so a data frame
# with zero columns yields 0 instead of an error from sum(list()).
sum(vapply(df_tnt, function(col) sum(is.na(col)), integer(1)))

# Named logical vector: TRUE for each column that contains at least one NA.
vapply(df_tnt, anyNA, logical(1))

## Correlogram that Rocks

library(corrplot)

# Select the columns whose class is exactly "numeric" (doubles). Integer
# columns are deliberately left out, as in the original selection; switch the
# predicate to is.numeric if they should be included.
# identical() keeps the test to one TRUE/FALSE per column even when a column
# carries several classes (e.g. ordered factors, date-times).
numeric_cols <- vapply(
  cc_train,
  function(col) identical(class(col), "numeric"),
  logical(1)
)
corrplotdata <- cor(cc_train[numeric_cols])

# Upper-triangle correlogram, variables ordered by hierarchical clustering,
# black labels rotated 60 degrees.
corrplot(
  corrplotdata,
  type = "upper", order = "hclust", tl.col = "black", tl.srt = 60
)
# With significance

#' Pairwise correlation-test p-values
#'
#' Runs `cor.test()` on every pair of columns of `mat` and collects the
#' p-values in a symmetric matrix.
#'
#' Note: defining this function masks `corrplot::cor.mtest` once corrplot is
#' attached; this version returns a plain matrix of p-values.
#'
#' @param mat A matrix, or an object `as.matrix()` can convert, whose columns
#'   are the variables to compare.
#' @param ... Further arguments forwarded to `cor.test()`.
#' @return An `ncol(mat)` by `ncol(mat)` numeric matrix of p-values with a zero
#'   diagonal; row and column names are the column names of `mat`.
cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- matrix(NA_real_, n, n)
  diag(p.mat) <- 0
  # seq_len() (not 1:(n - 1)) so inputs with fewer than two columns skip the
  # loop instead of indexing a column that does not exist.
  for (i in seq_len(max(n - 1L, 0L))) {
    for (j in (i + 1):n) {
      tmp <- cor.test(mat[, i], mat[, j], ...)
      # The test is symmetric in its two inputs, so fill both triangles.
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}

# Test significance on the underlying data, not on the correlation matrix:
# cor.mtest() runs cor.test() column against column, so it needs the raw
# observations. Reusing the column names of corrplotdata keeps p.mat aligned
# with the matrix being plotted.
p.mat <- cor.mtest(cc_train[colnames(corrplotdata)])

par(mar = c(0, 0, 6, 1) + 0.1)  # bottom, left, top, right
# par(oma = c(0, 0, 4, 0))

# Correlations with p-values > 0.01 are treated as insignificant and left
# blank. The "\n" in the title is a line break.
corrplot(
  corrplotdata,
  type = "lower", p.mat = p.mat, order = "hclust",
  sig.level = 0.01, insig = "blank",
  title = " \n Significance of Variables",
  tl.col = "black", number.font = 8, tl.srt = 20, method = "pie"
)

## MICE

# Run MICE to replace missing (NA) values.
# pmm = predictive mean matching, used here for the numeric columns;
# lda is used here for the categorical column.
#
# Refer to pg 54 of the MICE options: "logreg", "polyreg" and "polr" are the
# methods for binary/categorical models.
library(VIM)
library(mice)

# Table of missing-data patterns.
md.pattern(mb_df)

# Plot the share of missing values per variable and the missingness patterns.
aggr_plot <- aggr(
  mb_df,
  col = c("navyblue", "red"), numbers = TRUE, sortVars = TRUE,
  labels = names(mb_df), cex.axis = 0.7, gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

# STARS is imputed as a categorical variable, so it must be a factor.
mb_df$STARS <- factor(mb_df$STARS)

# Dry run: maxit = 0 builds the default method vector and predictor matrix
# without iterating, which is all this first call is used for.
init <- mice(mb_df, maxit = 0)
meth <- init$method
predM <- init$predictorMatrix

# Override the imputation method per column.
pmm_columns <- c(
  "pH", "ResidualSugar", "Chlorides", "FreeSulfurDioxide",
  "Alcohol", "TotalSulfurDioxide", "Sulphates"
)
meth[pmm_columns] <- "pmm"
meth["STARS"] <- "lda"

# Fix the seed so the imputations are reproducible.
set.seed(103)
tempData <- mice(mb_df, method = meth, predictorMatrix = predM, m = 5)

summary(tempData)

# Extract the first of the m = 5 completed data sets. Namespaced because
# tidyr also exports a complete().
mb_df2 <- as.data.frame(mice::complete(tempData, 1))

# describe() is presumably psych::describe (psych is attached in the basics
# section) -- confirm if another attached package also exports describe().
describe(mb_df2)

## Determine if a variable needs transforming

# --- Evaluate whether to transform a variable ---
# Point cc_var_view at the column under review; the plots below compare its
# distribution with a normal distribution.
cc_var_view <- credit_card_default$PAY_AMT1_trans
# cc_var_view <- cc_train$cr_util_bill_amnt1_trans

# Density-scaled histogram with a normal curve (same mean and sd) overlaid.
hist(
  cc_var_view,
  breaks = 10, freq = FALSE,
  main = "Histogram of PAY_AMT1_trans variable",
  xlab = "Coefficient Estimation"
)
curve(
  dnorm(x, mean = mean(cc_var_view), sd = sd(cc_var_view)),
  col = "red", lwd = 3, add = TRUE
)

# Normal Q-Q plot with reference line. qqline() only adds a line to the
# existing plot, so it takes no title.
qqnorm(cc_var_view, main = "Q-Q Plot of PAY_AMT1_trans")
qqline(cc_var_view, col = "red")

# --- End of variable evaluation ---

# --- Transform chosen variables ---
# Each transformed column is printed after it is created for a quick check.

# Inverse square root of AGE.
cc_train$age_trans <- cc_train$AGE^(-0.5)
cc_train$age_trans

# LIMIT_BAL raised to the power 0.3.
cc_train$limit_bal_trans <- cc_train$LIMIT_BAL^0.3
cc_train$limit_bal_trans

# log(x + 1) of BILL_AMT1. Values of -1 give -Inf and values below -1 give
# NaN, so both are replaced with 0 afterwards. Genuine NA values are left
# untouched.
cc_train$BILL_AMT1_trans <- log(cc_train$BILL_AMT1 + 1)
not_usable <- is.infinite(cc_train$BILL_AMT1_trans) |
  is.nan(cc_train$BILL_AMT1_trans)
cc_train$BILL_AMT1_trans[not_usable] <- 0
cc_train$BILL_AMT1_trans

## Leveling different factors between data sets

# Give PAY_1 and PAY_2 the same factor levels in the train and test sets.
# Assumes both columns are already factors in both data frames: levels()
# returns NULL for non-factors, which would leave the union empty.

# Union of the levels seen in test and train.
PAY_2_levels <- union(levels(cc_test$PAY_2), levels(cc_train$PAY_2))
PAY_1_levels <- union(levels(cc_test$PAY_1), levels(cc_train$PAY_1))

# Rebuild PAY_2 with the union of levels.
cc_test$PAY_2 <- factor(cc_test$PAY_2, levels = PAY_2_levels)
cc_train$PAY_2 <- factor(cc_train$PAY_2, levels = PAY_2_levels)

# Rebuild PAY_1 with the union of levels.
cc_test$PAY_1 <- factor(cc_test$PAY_1, levels = PAY_1_levels)
cc_train$PAY_1 <- factor(cc_train$PAY_1, levels = PAY_1_levels)

# Finished equalizing levels on the selected variables.

## Recode Factors

library(car)

# Recode PAY_1 in place. In each "old=new" pair the value on the left becomes
# the value on the right, e.g. 8 is updated to 10.
# Namespaced because dplyr also exports a recode() with a different interface.
# NOTE(review): the result is written back to cc_train$PAY_1; assign to a
# different column if the original values must be kept.
cc_train$PAY_1 <- car::recode(
  cc_train$PAY_1,
  "-2=0; 5=7; 6=8; 7=9; 8=10"
)

## Evaluate Results using ROC, AUROC, etc

# Accuracy: share of predictions that match the actual class.
mean(cc_lr_pred_m5_def_f == actual_data)
caret::confusionMatrix(cc_lr_pred_m5_def_f, actual_data, positive = "1")

# ROC curve, area under it, concordance and Kolmogorov-Smirnov diagnostics.
InformationValue::plotROC(actual_data, Predicted_values)
InformationValue::AUROC(actual_data, Predicted_values)
InformationValue::Concordance(actual_data, Predicted_values)$Concordance
InformationValue::ks_plot(actual_data, Predicted_values)
InformationValue::ks_stat(actual_data, Predicted_values) # , returnKSTable = TRUE)

# Lift on TEST data.
# Go through as.character() first: as.numeric() on a factor returns the
# internal level codes (1, 2, ...) rather than the labels ("0", "1").
# NOTE(review): assumes the labels of actual_data are numeric strings.
actual_datab <- as.numeric(as.character(actual_data))
gains.cross_test <- gains::gains(actual_datab, Predicted_values, groups = 20)
gains.cross_test
plot(gains.cross_test, main = "Cumulative Lift Chart – Test")

# AIC(Predicted_values)
logLik(cc_lr_model5)

## Replace Missing Values based on another Variable in a Separate Column

# Replace missing values with values derived from a related column.
# ind3 flags the rows where precipitation_amt_mm is missing.
ind3 <- is.na(df_tnt$precipitation_amt_mm)

# Row numbers of the missing values in the target and the source column.
which(is.na(df_tnt$precipitation_amt_mm))
which(is.na(df_tnt$reanalysis_min_air_temp_k))

# Fill the gaps using a formula on the related column (here Kelvin to
# Celsius). Rows where the source column is also NA stay NA.
# NOTE(review): this writes a temperature into a precipitation column, which
# demonstrates the pattern but looks unintended for real data; confirm the
# source column (reanalysis_precip_amt_kg_per_m2 is inspected below).
df_tnt$precipitation_amt_mm[ind3] <-
  df_tnt$reanalysis_min_air_temp_k[ind3] - 273.15

# Count what is still missing after the fill.
table(is.na(df_tnt$precipitation_amt_mm))
table(is.na(df_tnt$reanalysis_precip_amt_kg_per_m2))

## Filter a column i.e. two cities

# dplyr supplies %>% and the data-frame filter(); without it attached,
# filter() resolves to stats::filter, which does something else entirely.
library(dplyr)

# One data frame per city: rows where city is "sj", and rows where it is "iq".
df_train_sj_all <- df_tnt %>% filter(city == "sj")
df_train_iq_all <- df_tnt %>% filter(city == "iq")