Step 06: Score Mandals Kruskal

##############################################################
######## Project Name : SATSURE Credit Risk Product
######## File Usage : This scripts writes mandalwise yearly scores and also gets the optimal alpha for exponential weighted average score for the out of the year
######## Date : November 20, 2018
######## Author : Actify Data Labs
##############################################################

# How to run the script
# Rscript score_mandals_kruskal.R <crop_type> <district> <out_year>

### Passing Arguments
args <- commandArgs(TRUE)
crop_type <- tolower(args[1])
district <- tools::toTitleCase(args[2])
# out_year <- as.numeric(args[3])

### libraries
library(dplyr)
library(glue)
library(stringr)
library(moments)
library(tidyr)
library(data.table)

## params
#out_year <- 2017
#crop_type <- “maize”
#district <- “Srikakulam”
clusters_path <- glue(“/home/ubuntu/results/preprocessed/{district}/mandalwise_clusters_dataframes/{crop_type}”)
centroids_files_path <- glue(“/home/ubuntu/results/preprocessed/{district}/centroid_objects/{crop_type}”)
### Files list
clusters_files_lst <- list.files(clusters_path,full.names = TRUE,include.dirs = FALSE)

mandal_names <- lapply(clusters_files_lst, function(x){
names <- strsplit(x,”/”)[[1]] %>% tail(1) %>% strsplit(.,”_”) %>% unlist() %>% head(1)
return(names)
})

names(clusters_files_lst) <- mandal_names

#df <- data.table::fread(clusters_files_lst[[2]])
#mandal_name <- mandal_names[2]

### Cluster column names
n_clusters <- paste0(“k”,seq(2,6,1))
names(n_clusters) <- n_clusters

years <- as.character(seq(2013,2017,1))
names(years) <- years

mandal_scores <- mapply(function(file,mandal_name){
df <- data.table::fread(file)
## Getting best cluster yearwise for a mandal based on kruskal wallis statistic
yearwise_best_cluster <- lapply(years, function(y){
df_year <- df %>% filter(year == y)

if(nrow(df_year)==0){
return(NULL)
}
cluster_wise_kruskals <- lapply(n_clusters, function(k){
## Kruskal Wallis test across clusters for max value of pixels
kruskal <- kruskal.test(ndvi_stats_row_max ~ get(k), data = df_year)$statistic
return(kruskal)
})
max_kruskal_cluster_name <- names(which.max(cluster_wise_kruskals))
return(max_kruskal_cluster_name)
})

## Read centroids objects for the mandal for all years and clusters (2:6)
mandal_centroid_files <- list.files(centroids_files_path) %>% grep(pattern = glue(‘^{mandal_name}’), value = TRUE)
mandal_centroid_files <- paste0(centroids_files_path,”/”,mandal_centroid_files)
names(mandal_centroid_files) <- str_extract(mandal_centroid_files, pattern = “[0-9]{4}”)
centroids <- lapply(mandal_centroid_files, readRDS)

## Getting best no of clusters corresponding to an year
best_ks <- lapply(years, function(x) yearwise_best_cluster[[x]])
## Removing the NULL years for which clustering is not done
best_ks[sapply(best_ks, is.null)] <- NULL

## Getting aggregate metric for a mandal for each year and best k (maximum of product of NDVI and NDWI rooted)
stats <- mapply(function(yr,best_k){
k <- yr[[best_k]]
lapply(k,function(cluster){
if (ncol(cluster) == 2){
return(sqrt(max(apply(cluster,1,prod))))}
else {
return(max(cluster))
}
})

},centroids,best_ks,SIMPLIFY = FALSE)

stats_unlst <- unlist(stats)

## Getting weighted average value of agg score(across clusters) at mandal and year level
df_scores <- mapply(function(cluster_no,year){
cluster_score <- df %>%
filter(year == year) %>%
group_by(!!sym(cluster_no)) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate (year = year) %>%
### Append scores on year level to the corresponding data
cbind(.,max_prod = stats_unlst[names(stats_unlst) %>% grep(pattern = year, value = TRUE)]) %>%
group_by(year) %>%
summarise(score = sum(n*max_prod)/sum(n)) %>%
ungroup() %>%
as.data.frame()
return(cluster_score)
},best_ks,names(best_ks),SIMPLIFY = FALSE)

## row bind all years weighted average scores for each mandal to create dataframe
return(df_scores %>% bind_rows()
)}, clusters_files_lst,names(clusters_files_lst), SIMPLIFY = FALSE)

######### Converting mandal_scores into a clean dataftame with mandal and yearwise scores as columns
##### Getting out of years scores in csv
mandal_scores_list <- lapply(mandal_scores, function(mandal){
mandal_t <- as.data.frame(t(mandal[,-1]))
colnames(mandal_t) <- paste0(“X”,mandal$year)
return(mandal_t)
})

mandal_scores_df <- dplyr::bind_rows(mandal_scores_list) %>% cbind(mandal = names(mandal_scores_list),.)
mandal_scores_df[is.na(mandal_scores_df)] <- 0
#### Write 3 years data to csv
fwrite(mandal_scores_df,glue(‘/home/ubuntu/results/{district}_{crop_type}_mandal_scores.csv’),sep = “,”,row.names = FALSE)