Something went wrong on our end
check_opt_data.Rmd 18.30 KiB
title: "check_opt_data"
author: "CB"
date: "7/28/2020"
output: html_document
RCP0 whole Finland - version 7
(rslt_RCP0_FIN_V7.csv)
knitr::opts_chunk$set(echo = TRUE)
### Running on cPouta or WIN local
on_cPouta <- TRUE
### Set path to input files
if(on_cPouta == TRUE) {
# for pouta
datapath <- "/media/volume/outp_rcp0_test/"
path <- "/home/ubuntu/workspace/MultiForest2/"
# path <- paste0(getwd(),"/")
} else {
# for Windows
path <- "C:/MyTemp/r_SIMO_output/output/MF_FIN/final_RCP0_FIN_V3/"
}
### load libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(gridExtra)
library(egg)
poutaxy <- read.csv(paste0(datapath, "rslt_RCP0_FIN_V7_XY.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
pouta <- read.csv(paste0(datapath, "rslt_RCP0_FIN_V7.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
#poutaxy <- read.csv(paste0(datapath, "rslt_test_XY.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
#pouta <- read.csv(paste0(datapath, "rslt_test.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
How many stands?
length(unique(pouta$id))
Simulated regimes
unique(pouta$regime)
Quick summary of the data
summary(pouta)
Check for outliers of certain indicators
Based on summary, the following are looked at due to high max-min values:
- V
- i_Vm3
- Harvested_V (not under bark volumes and Biomass)
- ALL_MARKETED_MUSHROOMS
- V_total_deadwood
- N_where_D_gt_40
- prc_V_deciduous
- CARBON SINK
- Recreation
- Scenic
Indicator: V
boxplot(pouta$V, main="Indicator: V")
V_out <- pouta %>% filter(V > 1500) %>% # Standing volume above xx m3/ha
mutate(error = "V")
length(unique(V_out$id))
V_out_q <- pouta %>% filter(V > quantile(V,0.9995))
length(unique(V_out_q$id))
Indicator: i_Vm3
boxplot(pouta$i_Vm3, main="Indicator: i_Vm3")
i_Vm3_out <- pouta %>% filter(i_Vm3 > 50) %>% # annual volume increment above xx m3/ha
mutate(error = "i_Vm3")
length(unique(i_Vm3_out$id))
i_Vm3_out_q <- pouta %>% filter(i_Vm3 > quantile(i_Vm3,0.9995))
length(unique(i_Vm3_out_q$id))
Indicator: Harvested_V
plotharvest <- pouta %>% filter(Harvested_V != 0)
boxplot(plotharvest$Harvested_V, main="Harvested_V")
Harvested_V_out <- pouta %>% filter(Harvested_V > 1000) %>% # Harvested volume above xx m3/ha
mutate(error = "Harvested_V")
length(unique(Harvested_V_out$id))
Harvested_V_out_q <- pouta %>% filter(Harvested_V > quantile(Harvested_V,0.9995))
length(unique(Harvested_V_out_q$id))
Indicator: ALL_MARKETED_MUSHROOMS
boxplot(pouta$ALL_MARKETED_MUSHROOMS, main="ALL_MARKETED_MUSHROOMS" )
plot_mushroom <- pouta %>%
filter(ALL_MARKETED_MUSHROOMS < 10000) # filter out the extrem high one for plotting
boxplot(plot_mushroom$ALL_MARKETED_MUSHROOMS, main="ALL_MARKETED_MUSHROOMS without main outlier")
mushroom_out <- pouta %>% filter(ALL_MARKETED_MUSHROOMS > 700) %>% # Mushrooms above xx kg/ha
mutate(error = "ALL_MARKEDET_MUSHROOMS")
length(unique(mushroom_out$id))
mushroom_out_q <- pouta %>% filter(ALL_MARKETED_MUSHROOMS > quantile(ALL_MARKETED_MUSHROOMS,0.9995))
length(unique(mushroom_out_q$id))
Indicator: V_total_deadwood
boxplot(pouta$V_total_deadwood, main="V_total_deadwood")
deadwood_out <- pouta %>% filter(V_total_deadwood > 700) %>% # Volume of Deadwood above xx m3/ha
mutate(error = "V_total_deadwood")
length(unique(deadwood_out$id))
deadwood_out_q <- pouta %>% filter(V_total_deadwood > quantile(V_total_deadwood,0.9995))
length(unique(deadwood_out_q$id))
Indicator: N_where_D_gt_40
boxplot(pouta$N_where_D_gt_40, main="N_where_D_gt_40")
llt_out <- pouta %>% filter(N_where_D_gt_40 > 500) %>% # Number of big trees (> 40) above xx N/ha
mutate(error = "N_where_D_gt_40")
length(unique(llt_out$id))
llt_out_q <- pouta %>% filter(N_where_D_gt_40 > quantile(N_where_D_gt_40,0.9995))
length(unique(llt_out_q$id))
Indicator: prc_V_deciduous
ERROR is fixed! In upcoming data, it will not exist anymore
boxplot(pouta$prc_V_deciduous, main="prc_V_deciduous")
Indicator: CARBON SINK
plot_carbon <- pouta %>%
filter(CARBON_SINK > -5000000)
boxplot(plot_carbon$CARBON_SINK, main="CARBON_SINK without the huge outlier")
carbon_sink_out <- pouta %>% filter(CARBON_SINK < -1000000 | CARBON_SINK > 1000000) %>% # source/sink of more than 1000 t CO2/ha
mutate(error = "CARBON_SINK")
length(unique(carbon_sink_out$id))
unique(carbon_sink_out$regime)
carbon_sink_out_q <- pouta %>% filter(CARBON_SINK < quantile(CARBON_SINK,0.0005,na.rm = TRUE) | CARBON_SINK > quantile(CARBON_SINK,0.9995,na.rm = TRUE))
length(unique(carbon_sink_out_q$id))
What are the high values
carbononly <- pouta %>%
left_join(poutaxy[,c("id", "year", "regime", "standid", "CARBON_STORAGE")], by = c("id", "year", "regime")) %>%
select(c("id", "standid", "year", "regime", "V", "Harvested_V" ,"CARBON_STORAGE", "CARBON_SINK")) %>%
filter(id %in% unique(carbon_sink_out$id))
carbononly %>%
filter(CARBON_SINK < -1000000 | CARBON_SINK > 1000000)
Plot some example stands
carbonplot <- carbononly %>%
filter(regime %in% c("SA","initial_state") & id %in% 99037593 |
regime %in% c("BAU_30","initial_state") & id %in% 99034584 |
regime %in% c("CCF_2", "CCF_1", "CCF_3", "initial_state") & id %in% 99032541) %>%
mutate(deltaCarbonstorage = round(CARBON_SINK / 3.67, digits = 2))
# for scenarios BAU_30 and SA
p1 = ggplot(carbonplot[carbonplot$regime %in% c("BAU_30", "SA"),], aes(year, V)) +
geom_line() +
facet_grid(.~regime) +
theme(axis.text.x = element_text(angle = 90))
p2 = ggplot(carbonplot[carbonplot$regime %in% c("BAU_30", "SA"),], aes(year, CARBON_STORAGE)) +
geom_line() +
facet_grid(.~regime) +
theme(axis.text.x = element_text(angle = 90)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
p3 = ggplot(carbonplot[carbonplot$regime %in% c("BAU_30", "SA"),], aes(year, deltaCarbonstorage)) +
geom_line() +
facet_grid(.~regime) +
theme(axis.text.x = element_text(angle = 90)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
ggarrange(p1, p2, p3,
top = "BAU_30 99034584 standid: 9932854 - SA 99037593 standid: 10628286")
!!! HIGH Carbon sinks due to extremly high simulated Volumes !!!
# for scenarios CC_2, _1 and _3
p4 = ggplot(carbonplot[carbonplot$regime %in% c("CCF_2", "CCF_1", "CCF_3"),], aes(year, V)) +
geom_line() +
facet_grid(.~regime) +
theme(axis.text.x = element_text(angle = 90))
p5 = ggplot(carbonplot[carbonplot$regime %in% c("CCF_2", "CCF_1", "CCF_3"),], aes(year, CARBON_STORAGE)) +
geom_line() +
facet_grid(.~regime) +
theme(axis.text.x = element_text(angle = 90)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
p6 = ggplot(carbonplot[carbonplot$regime %in% c("CCF_2", "CCF_1", "CCF_3"),], aes(year, deltaCarbonstorage)) +
geom_line() +
facet_grid(.~regime) +
theme(axis.text.x = element_text(angle = 90)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
ggarrange(p4, p5, p6,
top = "99032541 - standid 23271031")
!!! HIGH Carbon sinks due to extremly high simulated Volumes !!!
Indicator: Recreation
boxplot(pouta$Recreation, main="Recreation")
recreation_out <- pouta %>% filter(Recreation < 0) %>% # values below 0
mutate(error = "Recreation")
length(unique(recreation_out$id))
recreation_out_q <- pouta %>% filter(Recreation < quantile(Recreation,0.0005,na.rm = TRUE))
length(unique(recreation_out_q$id))
Indicator: Scenic
boxplot(pouta$Scenic, main="Scenic")
scenic_out <- pouta %>% filter(Scenic < 0) %>% # values below 0
mutate(error = "Scenic")
length(unique(scenic_out$id))
scenic_out_q <- pouta %>% filter(Scenic < quantile(Scenic,0.0005,na.rm = TRUE))
length(unique(scenic_out_q$id))
Combine all excluded cases (based on the thresholds)
How many stands are effected? (Recreation and Scenic not included!)
out_all <- rbind(V_out, i_Vm3_out, Harvested_V_out, mushroom_out, deadwood_out, llt_out, carbon_sink_out
#,scenic_out, recreation_out
) %>%
left_join(poutaxy[,c("id", "year", "regime", "standid")], by = c("id", "year", "regime"))
length(unique(out_all$id))
How many stands have multiple times be filtered out?
length(unique(V_out$id)) +
length(unique(i_Vm3_out$id)) +
length(unique(Harvested_V_out$id)) +
length(unique(mushroom_out$id)) +
length(unique(deadwood_out$id)) +
length(unique(llt_out$id))+
length(unique(carbon_sink_out$id)) - length(unique(out_all$id))
Which stands caused unlogical values for several indicators?
out_all %>%
group_by(standid, error) %>%
count(error) %>%
spread(error, n)
Example stand that caused errors for 5 indicators
Which regimes?
# 5 indicators are: ALL_MARKETED_MUSHROOMS, Harvested_V, i_Vm3, N_where_D_gt_40, V
unique(out_all[out_all$standid == 14505106,]$regime)
# How does the input look like?
# 1;14505106;498288.75825802;6917281.65776072;1.228;2018-08-05;1;2;3;60;9;T1;1984;-1;1;1;1;1;0;0;1;1;4
# 2;2234390572;1;2;8;0;1800;0;1.14;;;;;;;;;;;;;;
# 2;2234390573;1;29;5;0;6240;0;1.3;;;;;;;;;;;;;;
Example stand that caused errors for 4 indicators
Which regimes?
# 4 indicators are: CARBON_SINK, Harvested_V, i_Vm3, V
unique(out_all[out_all$standid == 23271031,]$regime)
# How does the input look like?
# 1;23271031;549478.4914827;7027894.3495342;0.104;2019-09-01;1;1;2;20;1;S0;-1;-1;1;1;1;1;0;0;1;1;4
# 2;2282172490;3;4;92;5.11;69;30.73;21.12;;;;;;;;;;;;;;
# 2;2282172491;3;5;92;2.06;18;38.8;21.12;;;;;;;;;;;;;;
# 2;2282172492;2;2;18;0.48;391;3.94;5.6;;;;;;;;;;;;;;
# 2;2282172493;2;9;12;4.57;2249;5.09;3.64;;;;;;;;;;;;;;
ALL_MARKETED_MUSHROOMS occured often in combination with i_Vm3
Under which regimes?
unique(out_all[out_all$error %in% c("i_Vm3", "ALL_MARKEDET_MUSHROOMS"),]$regime)
Plot the individual indicators over the time
# # Use a subsample or all data
# standid <- unique(pouta$id)
# standidsample <- sample(standid, 100)
# rslt.final <- pouta %>% filter(id %in% standidsample)
# Use all data
# remove the ID that have a match in out_all
rslt.final <- pouta %>%
# filter(id != unique(out_all$id)) # does not work!?!?
anti_join(out_all, by = "id")
length(unique(rslt.final$id))
# percentage of broadleave contains still NA ! Remove it.
rslt.final$prc_V_deciduous[is.na(rslt.final$prc_V_deciduous)] <- 0
# Add the geographical region (South, Central, North, Lappland)
if(on_cPouta == TRUE) {
# for pouta
province <- read.csv(paste0(path,"params/provinceInfo/provinces2.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
} else {
# for Windows
province <- read.csv("C:/MyTemp/r_SIMO_output/params/provinceInfo/provinces2.csv", sep = ";", header = TRUE, stringsAsFactors = FALSE)
}
province <- province %>% rename(region = code_metsa)
rslt.final <- rslt.final %>%
left_join(province[,c("region", "NFIGrid")], by= "region")
rslt.final[, "NFIGrid"] <- as.factor(rslt.final[, "NFIGrid"])
rslt.final[, "NFIGrid"] <- factor(rslt.final[, "NFIGrid"], levels = c("South", "Central", "North", "Lapland"))