check_opt_data.Rmd

title: "check_opt_data"
author: "CB"
date: "7/28/2020"
output: html_document
knitr::opts_chunk$set(echo = TRUE)


### Running on cPouta or WIN local
on_cPouta <- TRUE


### Set path to input files

if(on_cPouta == TRUE) {
  # for pouta
  datapath <- "/media/volume/outp_rcp0_test/"
  path <- "/home/ubuntu/workspace/MultiForest2/"
  # path <- paste0(getwd(),"/")

} else {
  # for Windows
  path <- "C:/MyTemp/r_SIMO_output/output/MF_FIN/final_RCP0_FIN_V3/"

}


### load libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(gridExtra)
library(egg)


poutaxy <- read.csv(paste0(datapath, "rslt_RCP0_FIN_V7_XY.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
pouta <- read.csv(paste0(datapath, "rslt_RCP0_FIN_V7.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)

#poutaxy <- read.csv(paste0(datapath, "rslt_test_XY.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
#pouta <- read.csv(paste0(datapath, "rslt_test.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)

length(unique(pouta$id))
unique(pouta$regime)
summary(pouta)

boxplot(pouta$V, main="Indicator: V")

V_out <- pouta %>% filter(V > 1500) %>% # Standing volume above xx m3/ha
  mutate(error = "V")
length(unique(V_out$id))

V_out_q <- pouta %>% filter(V > quantile(V,0.9995))
length(unique(V_out_q$id))


boxplot(pouta$i_Vm3, main="Indicator: i_Vm3")

i_Vm3_out <- pouta %>%  filter(i_Vm3 > 50) %>%  # annual volume increment above xx m3/ha
  mutate(error = "i_Vm3")
length(unique(i_Vm3_out$id))

i_Vm3_out_q <- pouta %>% filter(i_Vm3 > quantile(i_Vm3,0.9995))
length(unique(i_Vm3_out_q$id))


plotharvest <- pouta %>% filter(Harvested_V != 0)

boxplot(plotharvest$Harvested_V, main="Harvested_V")

Harvested_V_out <- pouta %>% filter(Harvested_V > 1000) %>% # Harvested volume above xx m3/ha
  mutate(error = "Harvested_V")
length(unique(Harvested_V_out$id))

Harvested_V_out_q <- pouta %>% filter(Harvested_V > quantile(Harvested_V,0.9995))
length(unique(Harvested_V_out_q$id))


boxplot(pouta$ALL_MARKETED_MUSHROOMS, main="ALL_MARKETED_MUSHROOMS" )

plot_mushroom <- pouta %>%
  filter(ALL_MARKETED_MUSHROOMS < 10000) # filter out the extrem high one for plotting

boxplot(plot_mushroom$ALL_MARKETED_MUSHROOMS, main="ALL_MARKETED_MUSHROOMS without main outlier")

mushroom_out <- pouta %>% filter(ALL_MARKETED_MUSHROOMS > 700) %>% # Mushrooms above xx kg/ha
  mutate(error = "ALL_MARKEDET_MUSHROOMS")
length(unique(mushroom_out$id))

mushroom_out_q <- pouta %>% filter(ALL_MARKETED_MUSHROOMS > quantile(ALL_MARKETED_MUSHROOMS,0.9995))
length(unique(mushroom_out_q$id))


boxplot(pouta$V_total_deadwood, main="V_total_deadwood")

deadwood_out <- pouta %>% filter(V_total_deadwood > 700) %>%  # Volume of Deadwood above xx m3/ha
  mutate(error = "V_total_deadwood")
length(unique(deadwood_out$id))

deadwood_out_q <- pouta %>% filter(V_total_deadwood > quantile(V_total_deadwood,0.9995))
length(unique(deadwood_out_q$id))


boxplot(pouta$N_where_D_gt_40, main="N_where_D_gt_40")

llt_out <- pouta %>% filter(N_where_D_gt_40 > 500) %>% # Number of big trees (> 40) above xx N/ha
  mutate(error = "N_where_D_gt_40")
length(unique(llt_out$id))

llt_out_q <- pouta %>% filter(N_where_D_gt_40 > quantile(N_where_D_gt_40,0.9995))
length(unique(llt_out_q$id))


boxplot(pouta$prc_V_deciduous, main="prc_V_deciduous")


plot_carbon <- pouta %>%
  filter(CARBON_SINK > -5000000)

boxplot(plot_carbon$CARBON_SINK, main="CARBON_SINK without the huge outlier")

carbon_sink_out <- pouta %>% filter(CARBON_SINK < -1000000 | CARBON_SINK > 1000000) %>% # source/sink of more than 1000 t CO2/ha
  mutate(error = "CARBON_SINK")
length(unique(carbon_sink_out$id))
unique(carbon_sink_out$regime)

carbon_sink_out_q <- pouta %>% filter(CARBON_SINK < quantile(CARBON_SINK,0.0005,na.rm = TRUE) | CARBON_SINK > quantile(CARBON_SINK,0.9995,na.rm = TRUE))
length(unique(carbon_sink_out_q$id))


carbononly <- pouta %>%
  left_join(poutaxy[,c("id", "year", "regime", "standid", "CARBON_STORAGE")], by = c("id", "year", "regime")) %>%
  select(c("id", "standid", "year", "regime", "V", "Harvested_V" ,"CARBON_STORAGE", "CARBON_SINK")) %>%
  filter(id %in% unique(carbon_sink_out$id))

carbononly %>%
  filter(CARBON_SINK < -1000000 | CARBON_SINK > 1000000)


carbonplot <- carbononly %>%
  filter(regime %in% c("SA","initial_state") & id %in% 99037593 |
           regime %in% c("BAU_30","initial_state") & id %in% 99034584 |
           regime %in% c("CCF_2", "CCF_1", "CCF_3", "initial_state") & id %in% 99032541) %>%
  mutate(deltaCarbonstorage = round(CARBON_SINK / 3.67, digits = 2))

# for scenarios BAU_30 and SA
p1 = ggplot(carbonplot[carbonplot$regime %in% c("BAU_30", "SA"),], aes(year, V)) +
  geom_line() +
  facet_grid(.~regime) +
  theme(axis.text.x = element_text(angle = 90))

p2 = ggplot(carbonplot[carbonplot$regime %in% c("BAU_30", "SA"),], aes(year, CARBON_STORAGE)) +
  geom_line() +
  facet_grid(.~regime) +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

p3 = ggplot(carbonplot[carbonplot$regime %in% c("BAU_30", "SA"),], aes(year, deltaCarbonstorage)) +
  geom_line() +
  facet_grid(.~regime) +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

ggarrange(p1, p2, p3,
          top = "BAU_30 99034584 standid: 9932854 - SA 99037593	standid: 10628286")


# for scenarios CC_2, _1 and _3
p4 = ggplot(carbonplot[carbonplot$regime %in% c("CCF_2", "CCF_1", "CCF_3"),], aes(year, V)) +
  geom_line() +
  facet_grid(.~regime) +
  theme(axis.text.x = element_text(angle = 90))

p5 = ggplot(carbonplot[carbonplot$regime %in% c("CCF_2", "CCF_1", "CCF_3"),], aes(year, CARBON_STORAGE)) +
  geom_line() +
  facet_grid(.~regime) +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

p6 = ggplot(carbonplot[carbonplot$regime %in% c("CCF_2", "CCF_1", "CCF_3"),], aes(year, deltaCarbonstorage)) +
  geom_line() +
  facet_grid(.~regime) +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

ggarrange(p4, p5, p6,
          top = "99032541 - standid 23271031")


boxplot(pouta$Recreation, main="Recreation")

recreation_out <- pouta %>%  filter(Recreation < 0) %>%  # values below 0
  mutate(error = "Recreation")
length(unique(recreation_out$id))

recreation_out_q <- pouta %>% filter(Recreation < quantile(Recreation,0.0005,na.rm = TRUE))
length(unique(recreation_out_q$id))


boxplot(pouta$Scenic, main="Scenic")

scenic_out <- pouta %>% filter(Scenic < 0) %>%  # values below 0
  mutate(error = "Scenic")
length(unique(scenic_out$id))

scenic_out_q <- pouta %>% filter(Scenic < quantile(Scenic,0.0005,na.rm = TRUE))
length(unique(scenic_out_q$id))


out_all <- rbind(V_out, i_Vm3_out, Harvested_V_out, mushroom_out, deadwood_out, llt_out, carbon_sink_out
                 #,scenic_out, recreation_out
                 ) %>%
  left_join(poutaxy[,c("id", "year", "regime", "standid")], by = c("id", "year", "regime"))
length(unique(out_all$id))


length(unique(V_out$id)) +
  length(unique(i_Vm3_out$id)) +
  length(unique(Harvested_V_out$id)) +
  length(unique(mushroom_out$id)) +
  length(unique(deadwood_out$id)) +
  length(unique(llt_out$id))+
  length(unique(carbon_sink_out$id)) - length(unique(out_all$id))


out_all %>%
  group_by(standid, error) %>%
  count(error) %>%
  spread(error, n)


# 5 indicators are: ALL_MARKETED_MUSHROOMS, Harvested_V, i_Vm3, N_where_D_gt_40, V
unique(out_all[out_all$standid == 14505106,]$regime)

# How does the input look like?

# 1;14505106;498288.75825802;6917281.65776072;1.228;2018-08-05;1;2;3;60;9;T1;1984;-1;1;1;1;1;0;0;1;1;4
# 2;2234390572;1;2;8;0;1800;0;1.14;;;;;;;;;;;;;;
# 2;2234390573;1;29;5;0;6240;0;1.3;;;;;;;;;;;;;;


# 4 indicators are: CARBON_SINK, Harvested_V, i_Vm3, V
unique(out_all[out_all$standid == 23271031,]$regime)

# How does the input look like?

# 1;23271031;549478.4914827;7027894.3495342;0.104;2019-09-01;1;1;2;20;1;S0;-1;-1;1;1;1;1;0;0;1;1;4
# 2;2282172490;3;4;92;5.11;69;30.73;21.12;;;;;;;;;;;;;;
# 2;2282172491;3;5;92;2.06;18;38.8;21.12;;;;;;;;;;;;;;
# 2;2282172492;2;2;18;0.48;391;3.94;5.6;;;;;;;;;;;;;;
# 2;2282172493;2;9;12;4.57;2249;5.09;3.64;;;;;;;;;;;;;;


unique(out_all[out_all$error %in% c("i_Vm3", "ALL_MARKEDET_MUSHROOMS"),]$regime)


# # Use a subsample or all data
# standid <- unique(pouta$id)
# standidsample <- sample(standid, 100)
# rslt.final <- pouta %>% filter(id %in% standidsample)


# Use all data
# remove the ID that have a match in out_all
rslt.final <- pouta %>%
  # filter(id != unique(out_all$id)) # does not work!?!?
  anti_join(out_all, by = "id")

length(unique(rslt.final$id))


# percentage of broadleave contains still NA ! Remove it.
rslt.final$prc_V_deciduous[is.na(rslt.final$prc_V_deciduous)] <- 0


# Add the geographical region (South, Central, North, Lappland)


if(on_cPouta == TRUE) {
  # for pouta
  province <- read.csv(paste0(path,"params/provinceInfo/provinces2.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)

} else {
  # for Windows
  province <- read.csv("C:/MyTemp/r_SIMO_output/params/provinceInfo/provinces2.csv", sep = ";", header = TRUE, stringsAsFactors = FALSE)

}

province <- province %>% rename(region = code_metsa)

rslt.final <- rslt.final %>%
  left_join(province[,c("region", "NFIGrid")], by= "region")


rslt.final[, "NFIGrid"] <- as.factor(rslt.final[, "NFIGrid"])
rslt.final[, "NFIGrid"] <- factor(rslt.final[, "NFIGrid"], levels = c("South", "Central", "North", "Lapland"))