# Step 08: Mandal Scaled Scoring with Probs

##############################################################
######## Project Name : SATSURE Credit Risk Product
######## File Usage : This script creates and validates mandal groups (High, Non-High) based on actual scores for out of the year data for different alphas
######## Date : November 20, 2018
######## Author : Actify Data Labs
##############################################################

#How to run the script
# Rscript mandal_grouping.R <crop_type> <district> <scoring_year>

## Load Libraries
library(data.table)
library(dplyr)
library(glue)
library(openxlsx)
library(DT)

## Parameters ----
# Usage: Rscript mandal_grouping.R <crop_type> <district> <scoring_year>
args <- commandArgs(TRUE)
crop_type <- tolower(args[1])
district <- tools::toTitleCase(args[2])
yr <- as.numeric(args[3])
# Year columns in the score files are prefixed with "X" (e.g. "X2016").
yr <- paste0("X", yr)

# district <- "Vizianagaram"
# crop_type <- "maize"
# yr <- "X2016"

scoring_year <- as.numeric(substr(yr, 2, 5))
# The three years preceding the scoring year feed the exponentially
# weighted score; year_3 is the most recent of the three.
year_3 <- paste0("X", scoring_year - 1)
year_2 <- paste0("X", scoring_year - 2)
year_1 <- paste0("X", scoring_year - 3)

no_of_groups <- 2               # "High" vs "Non-High"
temp_scaling_flag <- TRUE       # apply temperature scaling factors to raw scores
no_of_years_to_be_used <- 3
# Candidate smoothing weights; the alpha that best reproduces the
# actual-score grouping is selected further below.
alphas <- seq(0.01, 0.99, 0.01)
names(alphas) <- paste0("alpha_", alphas)

## Read year-wise mandal scores and (optionally) apply temperature scaling
mandal_scores <- fread(glue("/home/ubuntu/results/{district}_{crop_type}_mandal_scores.csv"))
### Path of the per-year temperature scaling factors for this district/crop
temp_scale_path <- glue("/home/ubuntu/results/{district}_{crop_type}_temperature_scaling_factors.csv")

### Fail fast when the scaling file is missing: scaling is mandatory while
### temp_scaling_flag is TRUE, so scoring unscaled data would be wrong.
if (file.exists(temp_scale_path)) {
  temperature_scales <- fread(temp_scale_path)
} else {
  stop(glue("Temperature scaling Factors for {district} and {crop_type} doesn't exist !!!!"), call. = FALSE)
}

if (temp_scaling_flag) {
  # Element-wise product of the score matrix and the scaling-factor matrix.
  # NOTE(review): assumes both CSVs list mandals in the same row order and
  # share identical year columns -- confirm upstream guarantees this.
  mandal_scores_scaled <- as.data.frame(
    as.matrix(temperature_scales %>% select(-c("Mandal"))) *
      as.matrix(mandal_scores %>% select(-c("mandal")))
  )
  mandal_scores_scaled$mandal <- mandal_scores$mandal
  #mandal_scores_scaled <- mandal_scores_scaled %>% select(mandal,X2013,X2014,X2015,X2016,X2017)
} else {
  mandal_scores_scaled <- mandal_scores
}

### Round every numeric column of a data frame to `digits` decimal places.
### Non-numeric columns (character, factor, ...) are returned unchanged.
###   x      : data frame (or anything coercible via as.data.frame)
###   digits : number of decimal places to keep
### Returns the data frame with numeric columns rounded.
round_df <- function(x, digits) {
  x <- as.data.frame(x)
  # is.numeric() rather than mode(): factors also have mode "numeric",
  # and round() on a factor column would error.
  numeric_columns <- vapply(x, is.numeric, logical(1))
  x[numeric_columns] <- round(x[numeric_columns], digits)
  x
}

################################### Get optimum alpha #########################################################
# For every candidate alpha, compute each mandal's exponentially weighted
# score from the three years preceding the scoring year:
#   score = alpha * (y3 + (1 - alpha) * y2 + (1 - alpha)^2 * y1)
# apply() coerces the data frame to a character matrix (it contains the
# "mandal" column), hence the as.numeric() conversions.
mega_list <- pbapply::pbsapply(alphas, function(alpha){
  apply(mandal_scores_scaled, 1, function(mandal){
    final_score <- alpha*(as.numeric(mandal[year_3]) + (1-alpha)*as.numeric(mandal[year_2]) + ((1-alpha)^2)*as.numeric(mandal[year_1]))
    return(final_score)
  })
})

mega_list <- as.data.frame(mega_list) %>% mutate(mandals = mandal_scores$mandal)

# Named copy of the alpha labels so lapply() below returns a list keyed
# by alpha name.
alpha_names <- names(alphas)
names(alpha_names) <- alpha_names

### Drop mandals that have at least one zero-score year (treated as missing data)
df <- mandal_scores[apply(mandal_scores %>% select(-c(mandal)), 1, function(row){ all(row != 0)}),]
df <- df %>% select(mandal, !!sym(yr))

# Split the mandals into no_of_groups equal-size groups; the remainder of
# r mandals is absorbed into the last group.
r <- nrow(df) %% no_of_groups
group_size <- (nrow(df) - r) / no_of_groups

# For each candidate alpha: rank mandals by the calculated (weighted) score
# and by the actual score for the scoring year, split both rankings into
# groups, and count the overlap between corresponding groups.
overlap_grps <- lapply(alpha_names, function(alpha){

  # NOTE(review): this indexing assumes the rows of mega_list selected by
  # %in% come back in the same order as df$mandal -- true while both derive
  # from mandal_scores row order; confirm neither is re-sorted upstream.
  df$new_score <- mega_list[mega_list$mandals %in% df$mandal, alpha]
  calculated_score <- df %>% select(mandal, new_score) %>% arrange(desc(new_score))
  actual_score <- df %>% select(mandal, !!sym(yr)) %>% arrange(desc(!!sym(yr)))

  calculated_score_gp <- split(calculated_score, c(rep(1:no_of_groups, each = group_size), rep(no_of_groups, r)))
  # Group labels; assumes no_of_groups == 2 (two labels supplied).
  names(calculated_score_gp) <- c("High", "Non-High")

  # Tag each mandal with its group label, then flatten to one data frame.
  yields_propensity <- mapply(function(group, yield_type){
    group$yield_propensity <- yield_type
    return(group)
  }, calculated_score_gp, names(calculated_score_gp), SIMPLIFY = FALSE) %>% bind_rows() %>%
    select(-c(new_score)) %>%
    mutate(crop_type = crop_type) %>% mutate(scoring_year = gsub(pattern = "X", replacement = "", x = yr))

  actual_score_gp <- split(actual_score, c(rep(1:no_of_groups, each = group_size), rep(no_of_groups, r)))

  # Confusion matrix: temp_mat[x, y] = number of mandals in calculated
  # group x that also fall in actual group y. Zero-initialised; every cell
  # is filled by the loop below.
  temp_mat <- matrix(0L, nrow = no_of_groups, ncol = no_of_groups)

  for (x in 1:no_of_groups){
    for (y in 1:no_of_groups){
      temp_mat[x, y] = length(intersect(calculated_score_gp[[x]]$mandal, actual_score_gp[[y]]$mandal))
    }
  }
  return(list(mat = temp_mat, yields_propensity = yields_propensity))
})

# Trace of each alpha's confusion matrix = number of mandals whose
# calculated-score group agrees with their actual-score group.
trace_vec <- vapply(overlap_grps, function(grp) sum(diag(grp$mat)), numeric(1))

trace_vec

# Several alphas may tie for the best overlap; keep the largest of them.
best_alphas <- names(trace_vec)[trace_vec == max(trace_vec)]
max_alpha <- best_alphas[length(best_alphas)]
print(max_alpha)

print(overlap_grps[[max_alpha]]$mat)

##### Score every mandal with the selected alpha
alpha <- alphas[max_alpha]
mandal_scores <- mandal_scores %>%
  mutate(final_score = alpha * (!!sym(year_3) + (1 - alpha) * !!sym(year_2) + (1 - alpha)^2 * !!sym(year_1)))

### Keep only mandals with non-zero scores in every year
df_final <- mandal_scores[apply(mandal_scores %>% select(-c(mandal)), 1, function(row){ all(row != 0)}),]
#write.csv(df_final,glue("/home/ubuntu/results/{district}_{crop_type}_{yr}_calculated.csv"))

#### Min-max scale final_score to [0, 100] and label the top half "High"
df_final <- df_final %>%
  mutate(scaled_score = 100 * (final_score - min(final_score)) / (max(final_score) - min(final_score))) %>%
  arrange(desc(scaled_score)) %>%
  mutate(yield_propensity = ifelse(scaled_score >= median(scaled_score), "High", "Non-High")) %>%
  # funs() is deprecated in dplyr; list(~ ...) is the supported spelling
  mutate_if(.tbl = ., is.numeric, .funs = list(~ round(., 2)))

###################### Log ODDs regression ###############################

# high_fail_prob <- overlap_grps[[max_alpha]][[1]][1,2] / sum(overlap_grps[[max_alpha]][[1]][1,])
# non_high_fail_prob <- overlap_grps[[max_alpha]][[1]][2,2] / sum(overlap_grps[[max_alpha]][[1]][2,])
#
# #### Log ODDs
# high_fail_prob_log <- log(high_fail_prob/non_high_fail_prob)
# non_high_fail_prob_log <- log(non_high_fail_prob/high_fail_prob)
#
# high_avg <- df_final %>% group_by(yield_propensity) %>% summarise(avg = mean(final_score)) %>% as.data.frame() %>% filter(yield_propensity == "High") %>% pull(avg)
# non_high_avg <- df_final %>% group_by(yield_propensity) %>% summarise(avg = mean(final_score)) %>% as.data.frame() %>% filter(yield_propensity == "Non-High") %>% pull(avg)
#
# slope <- (high_fail_prob_log - non_high_fail_prob_log)/(high_avg - non_high_avg)
# intercept <- high_fail_prob_log - slope*high_avg
#
# df_final$fail_prob <- sapply(df_final$final_score, function(score){return(1 - 1/(exp(slope*score + intercept)+1))})
# df_final$fail_prob <- df_final$fail_prob*100
# df_final <- round_df(df_final,4) %>% select(mandal,final_score,yield_propensity,fail_prob)

###################### Log ODDs regression ends ###############################

###################### Quantile Based probs ###################################

## Winsorise the scaled score at the 5th and 95th percentiles so extreme
## mandals do not dominate, then expose it as the success probability.
## (Named score_quantiles so base stats::quantile is not masked.)
score_quantiles <- stats::quantile(df_final$scaled_score, probs = seq(0, 1, 0.05))
upper_cap_val <- score_quantiles["95%"]
lower_cap_val <- score_quantiles["5%"]

df_final <- df_final %>%
  mutate(scaled_score_trim = pmin(scaled_score, upper_cap_val)) %>%
  mutate(scaled_score_trim = pmax(scaled_score_trim, lower_cap_val))

df_final <- round_df(df_final, 4) %>%
  select(mandal, final_score, yield_propensity, scaled_score_trim) %>%
  rename(success_prob = scaled_score_trim)

###################### Quantile Based probs ends ###################################

# Writing final result: append df_final as a new sheet (one per
# district/crop/year) to the consolidated workbook, creating the workbook
# on first run.
# NOTE(review): addWorksheet() errors if a sheet with the same name already
# exists, so re-running for the same district/crop/year will fail.
out_path <- "~/results/mandalwise_scores_scaled.xlsx"
sheet_name <- glue::glue("{district}_{crop_type}_{substr(yr, 2, 5)}")

if (!file.exists(out_path)) {
  wb <- createWorkbook()
} else {
  wb <- loadWorkbook(out_path)
}
addWorksheet(wb, sheet_name)
writeData(wb, sheet = sheet_name, x = df_final)
saveWorkbook(wb, out_path, overwrite = TRUE)