From 4bffcd6c0596c2121c080612227138671f02c7e4 Mon Sep 17 00:00:00 2001
From: Margarete Walden <maw32@humboldt.edu>
Date: Tue, 27 Aug 2024 23:24:32 -0700
Subject: [PATCH] Replace sim_functions.R: to latest version.

---
 chin_sim_image/scripts/sim_functions.R | 1047 +++++++++++++-----------
 1 file changed, 580 insertions(+), 467 deletions(-)

diff --git a/chin_sim_image/scripts/sim_functions.R b/chin_sim_image/scripts/sim_functions.R
index 59e2c11..60d25c8 100644
--- a/chin_sim_image/scripts/sim_functions.R
+++ b/chin_sim_image/scripts/sim_functions.R
@@ -1,467 +1,580 @@
-####----ABOUT---------------------------------------------------------------####
-####   
-####   Evaluating alternative methods for modeling trap         
-####   efficiencies of outmigrating juvenile salmonids          
-####
-####   Authors:  Walden et al.                               
-####
-####   Content: R-Code for functions used in simulation      
-####
-####   This is the file with all functions needed to run the
-####   simulation shown in Section X of the manuscript.
-####
-
-####
-####----1. Function to generate beta-distributed data-----------------------####
-####
-
-## This uses names of variables declared in "scripts/sim_variables.R"
-simulateDataset <- function(nobs, type=c("train", "test", "tune"), 
-                            efficiency=c("Max efficiency 0.1", "Max efficiency 0.25"), 
-                            seeds=NA) {
-  
-  # Get number of observations depending on dataset type
-  thisnobs <- ifelse(type == "test", ntest, nobs)
-  
-  ## Create vector of parameter values
-  Beta <- NA
-  Beta0 <- ifelse(efficiency == "Max efficiency 0.1", Beta0_maxpt1, Beta0_maxpt25)
-  Beta[1] <- Beta1
-  Beta[2] <- Beta2
-  Beta[3] <- Beta3
-  Beta[4] <- Beta4
-  Beta[5] <- Beta5
-  Beta[6] <- Beta6
-  Beta[7] <- 1          # needed for year random effect
-  
-  # While-loop checks that generated dataset y values are all less than max 
-  # trap efficiency
-  thiscondition <- TRUE
-  while(thiscondition) {
- 
-    # If seeds aren't provided, then generate seeds, otherwise retrieve
-    # seeds.
-    if(any(is.na(seeds))) {
-      seed1to6 <- round(runif(6, 1, 999999999),0)
-      # While loop checks that there is at least one obs per random effect
-      # group
-      thiscond2 <- TRUE
-      while (thiscond2) {
-        seedyr <- round(runif(1, 1, 999999999),0)
-        set.seed(seedyr)
-        year <- sample.int(nyear, size=thisnobs, replace=TRUE)
-        thiscond2 <- length(unique(year)) != nyear
-      } # end while loop
-    } else {
-      seed1to6 <- as.numeric(seeds[1:6])
-      seedyr <- as.numeric(seeds[7])
-      set.seed(seedyr)
-      year <- sample.int(nyear, size=thisnobs, replace=TRUE)
-    } # end if-else loop
-    
-    # Create fixed-effects covariate vectors
-    set.seed(seed1to6[2])
-    Q <- runif(thisnobs, Qmin, Qmax)
-    Q <- scale(Q)
-    set.seed(seed1to6[3])
-    C <- runif(thisnobs, degCmin, degCmax)
-    C <- scale(C)
-    set.seed(seed1to6[4])
-    effort <- runif(thisnobs, effortmin, effortmax)
-    effort <- scale(effort)
-    set.seed(seed1to6[5])
-    precip <- sample(c(0, 1), size=thisnobs, replace=TRUE)    # yes-no
-    
-    # Create random effects, one for each group
-    set.seed(seed1to6[1])
-    yearRE <- rnorm(nyear, 0, raneffSD)
-
-    # Create matrix of covariates
-    covariates<-matrix(c(Q, C, effort, precip, 
-                         year), nrow=thisnobs, ncol=5)    
-
-    # Create design matrix
-    designMat<-cbind(covariates[,1], covariates[,2], covariates[,3], 
-                     covariates[,4], 
-                     covariates[,3] * covariates[,3],  
-                     covariates[,1] * covariates[,2],
-                     yearRE[covariates[,5]] + Beta0)
-    
-    # Calculate mean mu 
-    lin.mu <- designMat%*%Beta
-    mu <- inverselogit(lin.mu)
-
-    # Calculate beta distribution parameters
-    a <- mu / phiTrue
-    b <- (1 - mu) / phiTrue
-  
-    # Realized y
-    set.seed(seed1to6[6])
-    y <- rbeta(thisnobs, a, b)
-    
-    # Check realized y to exit while loop
-    thiscondition <- ifelse(efficiency == "Max efficiency 0.1", 
-                            length(which(y > 0.1)) > 0, 
-                            length(which(y > 0.25)) > 0)
-  } # Exit while loop
-  
-  # Collate output
-  seedcol <- as.numeric(c(seed1to6, seedyr, rep(NA, thisnobs-7)))
-  results <- cbind(y, covariates, a, b, seedcol)
-  return(results)
-} # End function
-
-####
-####----2. Function to create, compile, and run Nimble model----------------####
-####
-
-run_MCMC_allcode <- function(model, data, monitors, inits, niter, nburnin) {
-  
-  myModel <- readBUGSmodel(
-    model,
-    data=data,
-    buildDerivs=FALSE
-  )
-  
-  CmyModel <- compileNimble(myModel)
-  myMCMC <- buildMCMC(CmyModel, monitors=monitors)
-  CmyMCMC <- compileNimble(myMCMC)
-  
-  results <- runMCMC(
-    CmyMCMC,
-    inits=inits,
-    niter=niter,
-    nburnin=nburnin,
-    thin=10
-  )
-  
-  return(results)
-}
-
-####
-####----3. Function to parallelize Nimble MCMC chains-----------------------####
-####
-
-do <- function(cores, pop, fun, ...){
-  require(doParallel)
-  require(foreach)
-  require(nimble)
-  cl <- makeCluster(cores)
-  registerDoParallel(cl)
-  results <- foreach(pop, 
-                     .packages=c("nimble")) %dopar% fun(...)
-  stopCluster(cl)
-  return(results)
-}
-
-####
-####----4. Calculate f-statistic: proportion of mcmc chain with same -------####
-####       sign as the mean
-####
-
-fstat <- function(x) {
-  if(mean(x) < 0) {
-    temp <- mean(x < 0, na.rm=TRUE)
-    return(temp)
-  }
-  if(mean(x) > 0) {
-    temp <- mean(x > 0, na.rm=TRUE)
-    return(temp)
-  }
-  if(mean(x) == 0) {
-    return(0)
-  }
-}
-
-####
-####----5. Calculate log-likelihood for beta-distributed variable-----------####
-####       (Weinhold et al. 2020)
-####
-
-betaLogLik <- function(y, mu, phi) {
-  lgamma(phi) - lgamma(mu*phi) - lgamma((1 - mu)*phi) + (mu*phi - 1)*log(y) + 
-    ((1 - mu)*phi - 1)*log(1 - y)
-}
-
-####
-####----6. Loss function for beta-distributed variable used in MLE----------####
-####       (Weinhold et al. 2020)
-####
-
-betaLoss <- function(par, y, mu) {
-  -sum(lgamma(par) - lgamma(mu*par) - lgamma((1 - mu)*par) + 
-         (mu*par - 1)*log(y) + ((1 - mu)*par - 1)*log(1 - y))
-}
-
-####
-####----7. Get predictions from ranger random forest object-----------------####
-####----   when pdp or iml packages used
-
-pred.fun.ranger <- function(object, newdata) { 
-  results <- predict(object, newdata)$predictions
-  return(results)
-}
-
-####
-####----8. Get root mean squared error (RMSE) using beta distribution-------####
-####       Standardized residuals defined by Ferrari and Cribari-Neto 2004  ####
-
-rmse <- function(y_test, y_predicted, phi) {
-  std.residuals <- (y_test - y_predicted) / 
-    sqrt( (y_predicted * (1 - y_predicted)) / (1 + phi))
-  out <- sqrt(mean(std.residuals^2))
-  return(out)
-}
-
-####
-####----9. Parallel bootstrapping of boosted beta regression----------------####
-####                                                                        
-
-run_bootstrap <- function(nboot, mstopmu, mstopphi, maindata, mainfmu, 
-                          mainfphi, testdata, i, nyear) {
-  inverselogit <- function(alpha) {
-    exp(alpha) / ( exp(alpha) + 1 )
-  }
-  logit <- function(p) {
-    log(p/(1-p))
-  }
-  results <- matrix(data=NA, nrow=300, ncol=1)
-  thiscondition <- FALSE
-  while(thiscondition == FALSE){
-    bootingfunc <- function() {
-      tryCatch(
-        expr={
-              thiscond <- TRUE
-              while (thiscond) {
-                temp <- sample(1:i, i, replace=TRUE) # get index of obs to keep
-                bootdat <- maindata[temp, ] # get dataset
-                thiscond <- length(unique(bootdat$year)) != nyear
-              }
-              results <- FDboostLSS(list(mu=mainfmu, phi=y ~ 1), 
-                timeformula=NULL, families=BetaLSS(),  data=bootdat, 
-                control=boost_control(mstop=c(mu=mstopmu, phi=mstopphi), nu=0.1))
-              return(results)
-        }, error=function(error_message){
-          out <- error_message
-          return(out)
-        }
-      )
-    }
-    modelBoot <- bootingfunc()
-    if(any(class(modelBoot) == "FDboostLSS")) {
-        thiscondition <- TRUE
-    } else {
-      temp <- paste(as.character(modelBoot))
-      temp <- str_sub(word(temp,-1),end=-2)
-      mstopmu <- as.numeric(temp) - 1
-    }
-  }
-
-  # Keep mstop values
-  results[20,] <- mstop(modelBoot)[[1]]
-  results[21,] <- mstop(modelBoot)[[2]]
-    
-  #  #B1 on link scale (logit)
-  results[1,] <- coef(modelBoot, raw=TRUE, which=2, parameter="mu")[[1]]
-  #  #B2 on link scale (logit)
-  results[2,] <- coef(modelBoot, raw=TRUE, which=3, parameter="mu")[[1]]
-  #  #B3 on link scale (logit)
-  results[3,] <- coef(modelBoot, raw=TRUE, which=4, parameter="mu")[[1]]
-  #  #B4 on link scale (logit)
-  results[4,] <- coef(modelBoot, raw=TRUE, which=5, parameter="mu")[[1]]
-  #  #B5 on link scale (logit)
-  x <- coef(modelBoot)$mu$smterms$`bbs(effort)`$x
-  y <- coef(modelBoot)$mu$smterms$`bbs(effort)`$value[,1]
-  thisdat <- data.frame("x"=x, "y"=y)
-  thismod <- lm(y ~ I(x^2), data=thisdat)
-  results[5,] <- as.numeric(thismod$coefficients[2])
-  #  #B6 on link scale (logit)
-  results[6,] <- coef(modelBoot, raw=TRUE, which=7, parameter="mu")[[1]]
-  # #Phi intercept
-  results[11,] <- 1 / exp(modelBoot$phi$offset +
-                        as.vector(coef(modelBoot)$phi$intercept))
-  #B0 on link scale (logit)
-  results[10,] <- modelBoot$mu$offset + as.vector(coef(modelBoot)$mu$intercept)
-  # yr.sd
-  results[7,] <- sd(as.vector(coef(modelBoot, raw=TRUE)$mu$`brandom(year)`))
-  
-  # prediction on test dataset using bootstrapped model
-  testset <- testdata
-  oldw <- getOption("warn")
-  options(warn=-1)
-  temp <- predict(modelBoot, newdata=testset, type="response")
-  options(warn=oldw)
-  testset$predy <- temp$mu
-  testset$predphi <- 1 / temp$phi
-  temp2 <- as.vector(coef(modelBoot, raw=TRUE)$mu$`brandom(year)`)
-  for(m in 1:nrow(testset)){
-    testset$raneff[m] <- temp2[testset$year[m]]
-    testset$predy2[m] <- inverselogit(logit(testset$predy[m]) - testset$raneff[m])
-    testset$yhat[m] <-rbeta(1, testset$predy2[m] / testset$predphi,
-                            (1 - testset$predy2[m]) / testset$predphi)
-  }
-    
-  results[100:199,] <- testset$yhat
-  results[200:299,] <- testset$predphi
-  return(results)
-}
-
-####
-####----9.1. Bootstrapping with out-of-sample prediction--------------------####
-####                                                                        
-
-run_bootstrap_oos <- function(nboot, mstopmu, mstopphi, maindata, mainfmu, 
-  mainfphi, testdata, i, nyear) {
-  inverselogit <- function(alpha) {
-    exp(alpha) / ( exp(alpha) + 1 )
-  }
-  logit <- function(p) {
-    log(p/(1-p))
-  }
-  results <- matrix(data=NA, nrow=300, ncol=1)
-  thiscondition <- FALSE
-  while(thiscondition == FALSE){
-    bootingfunc <- function() {
-      tryCatch(
-        expr={
-          thiscond <- TRUE
-          while (thiscond) {
-            keepindex <- sample(1:i, i, replace=TRUE) # get index of obs to keep
-            bootdat <- maindata[keepindex, ] # get dataset
-            thiscond <- length(unique(bootdat$year)) != nyear
-          }
-          results <- FDboostLSS(list(mu=mainfmu, phi=y ~ 1), 
-            timeformula=NULL, 
-            families=BetaLSS(), 
-            data=bootdat, 
-            control=boost_control(mstop=c(mu=mstopmu, 
-              phi=mstopphi), 
-              nu=0.1))
-          return(list(results,keepindex))
-        }, error=function(error_message){
-          out <- error_message
-          return(out)
-        }
-      )
-    }
-    modelBoot <- bootingfunc()
-    if(any(class(modelBoot) == "error")) {
-      temp <- paste(as.character(modelBoot))
-      temp <- str_sub(word(temp,-1),end=-2)
-      mstopmu <- as.numeric(temp) - 1
-    } else {
-      keepindex <- modelBoot[[2]]
-      modelBoot <- modelBoot[[1]]
-      thiscondition <- TRUE
-    }
-  }
-  
-  # Keep mstop values
-  results[20,] <- mstop(modelBoot)[[1]]
-  results[21,] <- mstop(modelBoot)[[2]]
-  
-  #  #B1 on link scale (logit)
-  results[1,] <- coef(modelBoot, raw=TRUE, which=2, parameter="mu")[[1]]
-  #  #B2 on link scale (logit)
-  results[2,] <- coef(modelBoot, raw=TRUE, which=3, parameter="mu")[[1]]
-  #  #B3 on link scale (logit)
-  results[3,] <- coef(modelBoot, raw=TRUE, which=4, parameter="mu")[[1]]
-  #  #B4 on link scale (logit)
-  results[4,] <- coef(modelBoot, raw=TRUE, which=5, parameter="mu")[[1]]
-  #  #B5 on link scale (logit)
-  x <- coef(modelBoot)$mu$smterms$`bbs(effort)`$x
-  y <- coef(modelBoot)$mu$smterms$`bbs(effort)`$value[,1]
-  thisdat <- data.frame("x"=x, "y"=y)
-  thismod <- lm(y ~ I(x^2), data=thisdat)
-  results[5,] <- as.numeric(thismod$coefficients[2])
-  #  #B6 on link scale (logit)
-  results[6,] <- coef(modelBoot, raw=TRUE, which=7, parameter="mu")[[1]]
-  # #Phi intercept
-  results[11,] <- 1 / exp(modelBoot$phi$offset +
-      as.vector(coef(modelBoot)$phi$intercept))
-  #B0 on link scale (logit)
-  results[10,] <- modelBoot$mu$offset + as.vector(coef(modelBoot)$mu$intercept)
-  # yr.sd
-  results[7,] <- sd(as.vector(coef(modelBoot, raw=TRUE)$mu$`brandom(year)`))
-  
-  # prediction on test dataset using bootstrapped model
-  dontkeep <- unique(keepindex)
-  testset <- maindata
-  maindata$year <- as.numeric(as.factor(maindata$year))
-  oldw <- getOption("warn")
-  options(warn=-1)
-  temp <- predict(modelBoot, type="response")
-  options(warn=oldw)
-  maindata$predy <- temp$mu
-  maindata$predphi <- 1 / temp$phi
-  temp2 <- as.vector(coef(modelBoot, raw=TRUE)$mu$`brandom(year)`)
-  for(m in 1:nrow(maindata)){
-    maindata$raneff[m] <- temp2[maindata$year[m]]
-    maindata$predy2[m] <- inverselogit(logit(maindata$predy[m]) - maindata$raneff[m])
-    maindata$yhat[m] <-rbeta(1, maindata$predy2[m] / maindata$predphi,
-      (1 - maindata$predy2[m]) / maindata$predphi)
-  }
-  
-  maindata$yhat[dontkeep] <- NA
-  st <- nrow(maindata) + 100 - 1
-  results[100:st,] <- maindata$yhat
-  return(results)
-}
-
-####
-####----9.5 Function to parallelize Bootstrapping---------------------------####
-####                                                                        
-doboot <- function(numcores, pop, fun, ...){
-  require(doParallel)
-  require(foreach)
-  require(gamboostLSS) 
-  require(FDboost)
-  require(stringr)
-  cl <- makeCluster(numcores)
-  registerDoParallel(cl)
-  # clusterExport(cl, varlist=c("FUN", "a", "b"))
-  results <- foreach(pop, .combine=cbind,
-                     .packages=c("gamboostLSS", 
-                                   "FDboost", 
-                                   "stringr")) %dopar% fun(...)
-  stopCluster(cl)
-  return(results)
-}
-
-####
-####----10. Inverse-logit function------------------------------------------####
-
-inverselogit <- function(alpha) {
-  exp(alpha) / ( exp(alpha) + 1 )
-}
-
-####
-####----11. logit function------------------------------------------####
-
-logit <- function(p) {
-  log(p/(1-p))
-}
-
-
-####
-####----12. Parallelize cvrisk() function-----------------------------------####
-
-myApply <- function(dataset, FDboostLSS, ...){
-  myFun <- function(...) {
-    library("mboost")
-    FDboostLSS(...)
-  }
-  parLapply(cl=cl, dataset, myFun, ...)
-}
-
-
-####
-####----13. Customized predictor function (source: iml package at ----------####
-####        https://github.com/giuseppec/iml/blob/main/R/Predictor.R
-
-# This adds "regression" as the task for the iml:Predictor function. Otherwise
-# the task is "unknown" and it throws an error.
-
-inferTaskFromModel.ranger <- function(model) {
-  return("regression")
-}
\ No newline at end of file
+####----ABOUT---------------------------------------------------------------####
+####   
+####   Evaluating alternative methods for modeling trap         
+####   efficiencies of outmigrating juvenile salmonids          
+####
+####   Authors:  Walden et al.                               
+####
+####   Content: R-Code for functions used in simulation      
+####
+####   This is the file with all functions needed to run the
+####   simulation shown in Section X of the manuscript.
+####
+
+####
+####----1. Function to generate beta-distributed data-----------------------####
+####
+
+## This uses names of variables declared in "scripts/sim_variables.R"
+simulateDataset <- function(nobs, type=c("train", "test", "tune"), 
+                            efficiency=c("Max efficiency 0.1", "Max efficiency 0.25"), 
+                            seeds=NA) {
+  
+  # Get number of observations depending on dataset type
+  thisnobs <- ifelse(type == "test", ntest, nobs)
+  
+  ## Create vector of parameter values
+  Beta <- NA
+  Beta0 <- ifelse(efficiency == "Max efficiency 0.1", Beta0_maxpt1, Beta0_maxpt25)
+  Beta[1] <- Beta1
+  Beta[2] <- Beta2
+  Beta[3] <- Beta3
+  Beta[4] <- Beta4
+  Beta[5] <- Beta5
+  Beta[6] <- Beta6
+  Beta[7] <- 1          # needed for year random effect
+  
+  # While-loop checks that generated dataset y values are all less than max 
+  # trap efficiency
+  thiscondition <- TRUE
+  while(thiscondition) {
+ 
+    # If seeds aren't provided, then generate seeds, otherwise retrieve
+    # seeds.
+    if(any(is.na(seeds))) {
+      seed1to6 <- round(runif(6, 1, 999999999),0)
+      # While loop checks that there is at least one obs per random effect
+      # group
+      thiscond2 <- TRUE
+      while (thiscond2) {
+        seedyr <- round(runif(1, 1, 999999999),0)
+        set.seed(seedyr)
+        year <- sample.int(nyear, size=thisnobs, replace=TRUE)
+        thiscond2 <- length(unique(year)) != nyear
+      } # end while loop
+    } else {
+      seed1to6 <- as.numeric(seeds[1:6])
+      seedyr <- as.numeric(seeds[7])
+      set.seed(seedyr)
+      year <- sample.int(nyear, size=thisnobs, replace=TRUE)
+    } # end if-else loop
+    
+    # Create fixed-effects covariate vectors
+    set.seed(seed1to6[2])
+    Q <- runif(thisnobs, Qmin, Qmax)
+    Q <- scale(Q)
+    set.seed(seed1to6[3])
+    C <- runif(thisnobs, degCmin, degCmax)
+    C <- scale(C)
+    set.seed(seed1to6[4])
+    effort <- runif(thisnobs, effortmin, effortmax)
+    effort <- scale(effort)
+    set.seed(seed1to6[5])
+    precip <- sample(c(0, 1), size=thisnobs, replace=TRUE)    # yes-no
+    
+    # Create random effects, one for each group
+    set.seed(seed1to6[1])
+    yearRE <- rnorm(nyear, 0, raneffSD)
+
+    # Create matrix of covariates
+    covariates<-matrix(c(Q, C, effort, precip, 
+                         year), nrow=thisnobs, ncol=5)    
+
+    # Create design matrix
+    designMat<-cbind(covariates[,1], covariates[,2], covariates[,3], 
+                     covariates[,4], 
+                     covariates[,3] * covariates[,3],  
+                     covariates[,1] * covariates[,2],
+                     yearRE[covariates[,5]] + Beta0)
+    
+    # Calculate mean mu 
+    lin.mu <- designMat%*%Beta
+    mu <- inverselogit(lin.mu)
+
+    # Calculate beta distribution parameters
+    a <- mu / phiTrue
+    b <- (1 - mu) / phiTrue
+  
+    # Realized y
+    set.seed(seed1to6[6])
+    y <- rbeta(thisnobs, a, b)
+    
+    # Check realized y to exit while loop
+    thiscondition <- ifelse(efficiency == "Max efficiency 0.1", 
+                            length(which(y > 0.1)) > 0, 
+                            length(which(y > 0.25)) > 0)
+  } # Exit while loop
+  
+  # Collate output
+  seedcol <- as.numeric(c(seed1to6, seedyr, rep(NA, thisnobs-7)))
+  results <- cbind(y, covariates, a, b, seedcol)
+  return(results)
+} # End function
+
+####
+####----2. Function to create, compile, and run Nimble model----------------####
+####
+
+run_MCMC_allcode <- function(model, data, monitors, inits, niter, nburnin) {
+  
+  myModel <- readBUGSmodel(
+    model,
+    data=data,
+    buildDerivs=FALSE
+  )
+  
+  CmyModel <- compileNimble(myModel)
+  myMCMC <- buildMCMC(CmyModel, monitors=monitors)
+  CmyMCMC <- compileNimble(myMCMC)
+  
+  results <- runMCMC(
+    CmyMCMC,
+    inits=inits,
+    niter=niter,
+    nburnin=nburnin,
+    thin=10
+  )
+  
+  return(results)
+}
+
+####
+####----3. Function to parallelize Nimble MCMC chains-----------------------####
+####
+
+do <- function(cores, pop, fun, ...){
+  require(doParallel)
+  require(foreach)
+  require(nimble)
+  cl <- makeCluster(cores)
+  registerDoParallel(cl)
+  results <- foreach(pop, 
+                     .packages=c("nimble")) %dopar% fun(...)
+  stopCluster(cl)
+  return(results)
+}
+
+####
+####----4. Calculate f-statistic: proportion of mcmc chain with same -------####
+####       sign as the mean
+####
+
+fstat <- function(x) {
+  if(mean(x) < 0) {
+    temp <- mean(x < 0, na.rm=TRUE)
+    return(temp)
+  }
+  if(mean(x) > 0) {
+    temp <- mean(x > 0, na.rm=TRUE)
+    return(temp)
+  }
+  if(mean(x) == 0) {
+    return(0)
+  }
+}
+
+####
+####----5. Calculate log-likelihood for beta-distributed variable-----------####
+####       (Weinhold et al. 2020)
+####
+
+betaLogLik <- function(y, mu, phi) {
+  lgamma(phi) - lgamma(mu*phi) - lgamma((1 - mu)*phi) + (mu*phi - 1)*log(y) + 
+    ((1 - mu)*phi - 1)*log(1 - y)
+}
+
+####
+####----6. Loss function for beta-distributed variable used in MLE----------####
+####       (Weinhold et al. 2020)
+####
+
+betaLoss <- function(par, y, mu) {
+  -sum(lgamma(par) - lgamma(mu*par) - lgamma((1 - mu)*par) + 
+         (mu*par - 1)*log(y) + ((1 - mu)*par - 1)*log(1 - y))
+}
+
+####
+####----7. Get predictions from ranger random forest object-----------------####
+####----   when pdp or iml packages used
+
+pred.fun.ranger <- function(object, newdata) { 
+  results <- predict(object, newdata)$predictions
+  return(results)
+}
+
+####
+####----8. Get root mean squared error (RMSE) using beta distribution-------####
+
+## Ferrari and Cribari-Neto 2004 standardized residuals
+
+rmse_fcn <- function(y_test, y_predicted, phi) {
+  numer <- y_test - y_predicted
+  tphi <- 1/phi
+  denom <- sqrt((y_predicted*(1-y_predicted))/(1 + tphi))
+  std.residuals <- numer/denom
+  out <- sqrt(mean(std.residuals^2))
+  return(out)
+}
+
+## Mean absolute error
+mae <- function(y_test, y_predicted) {
+  out <- mean(abs(y_predicted - y_test))
+  return(out)
+}
+
+## General RMSE
+rmse_g <- function(y_test, y_predicted) {
+  out <- sqrt(mean((y_predicted - y_test)^2))
+  return(out)
+}
+
+####
+####----9. Parallel bootstrapping of boosted beta regression----------------####
+####                                                                        
+
+run_bootstrap <- function(mstopmu, mstopphi, maindata, mainfmu, 
+                          mainfphi, testdata, kfoldB, i, nyear, nu, incores) {
+  
+  inverselogit <- function(alpha) {
+    exp(alpha) / ( exp(alpha) + 1 )
+  }
+  
+  logit <- function(p) {
+    log(p/(1-p))
+  }
+  
+  results <- matrix(data=NA, nrow=300, ncol=1)
+  
+  ## Get bootstrapped dataset
+  
+  thiscond <- TRUE
+  while (thiscond) {
+    temp <- sample(1:i, i, replace=TRUE) # get index of obs to keep
+    bootdat <- maindata[temp, ] # get dataset
+    thiscond <- length(unique(bootdat$year)) != nyear # make sure all groups are present
+  }
+  
+  ## Tune hyperparameters to bootstrap sample
+  
+  thiscondition <- TRUE
+  while(thiscondition==TRUE){
+    tuningfunc <- function() {
+      tryCatch(
+        expr={
+          bootune <-  FDboostLSS(list(mu=mainfmu, phi=mainfphi), timeformula=NULL, 
+                                   families=BetaLSS(), 
+                                   data=bootdat, 
+                                   control=boost_control(mstop=c(mu=mstopmu, 
+                                                                 phi=mstopphi),
+                                                         nu=nu))
+          cl <- makeCluster(incores)
+          myApply <- function(X, FUN, ...){
+            myFun <- function(...) {
+              library("FDboost")
+              tryCatch({
+                FUN(...)
+              }, error=function(e) {
+                out <- paste(e)
+                return(out)
+              })
+            }
+            parLapply(cl, X, myFun, ...)
+          }
+          resulttune <- cvrisk(bootune, papply=myApply,
+                           folds=mboost::cv(model.weights(bootune), type="kfold",
+                                            B=kfoldB),
+                           trace=FALSE)
+          stopCluster(cl)
+          return(resulttune)
+        }, error=function(e) {
+          result <- paste(e)
+          return(result)
+        }
+      )
+    }
+    cvn <- tuningfunc()
+    if(any(class(cvn) == "cvriskLSS")) {
+      thiscondition <- FALSE
+    } else {
+      # print(cvn)
+      temp <- paste(as.character(cvn))
+      temp <- min(as.numeric(unlist(strsplit(temp, split=" |\n"))), na.rm=TRUE)
+      mstopmu <- temp - 1
+      # print(mstopmu)
+    }
+  }
+  ### Finish tuning
+  mmu <- mstop(cvn)[[1]]
+  
+  thiscondition <- FALSE
+  while(thiscondition == FALSE){
+    bootingfunc <- function() {
+      tryCatch(
+        expr={
+              results <- FDboostLSS(list(mu=mainfmu, phi=y ~ 1), 
+                timeformula=NULL, 
+                families=BetaLSS(),  
+                data=bootdat, 
+                control=boost_control(mstop=c(mu=mmu, phi=mstop(cvn)[[2]]), nu=nu))
+              return(results)
+        }, error=function(error_message){
+          out <- error_message
+          return(out)
+        }
+      )
+    }
+    modelBoot <- bootingfunc()
+    if(any(class(modelBoot) == "FDboostLSS")) {
+        thiscondition <- TRUE
+    } else {
+      temp <- paste(as.character(modelBoot))
+      temp <- str_sub(word(temp,-1),end=-2)
+      mmu <- as.numeric(temp) - 1
+    }
+  }
+
+  # Keep mstop values
+  results[20,] <- mstop(modelBoot)[[1]]
+  results[21,] <- mstop(modelBoot)[[2]]
+    
+  #  #B1 on link scale (logit)
+  results[1,] <- coef(modelBoot, raw=TRUE, which=2, parameter="mu")[[1]]
+  #  #B2 on link scale (logit)
+  results[2,] <- coef(modelBoot, raw=TRUE, which=3, parameter="mu")[[1]]
+  #  #B3 on link scale (logit)
+  results[3,] <- coef(modelBoot, raw=TRUE, which=4, parameter="mu")[[1]]
+  #  #B4 on link scale (logit)
+  results[4,] <- coef(modelBoot, raw=TRUE, which=5, parameter="mu")[[1]]
+  #  #B5 on link scale (logit)
+  x <- coef(modelBoot)$mu$smterms$`bbs(effort)`$x
+  y <- coef(modelBoot)$mu$smterms$`bbs(effort)`$value[,1]
+  thisdat <- data.frame("x"=x, "y"=y)
+  thismod <- lm(y ~ I(x^2), data=thisdat)
+  results[5,] <- as.numeric(thismod$coefficients[2])
+  #  #B6 on link scale (logit)
+  results[6,] <- coef(modelBoot, raw=TRUE, which=7, parameter="mu")[[1]]
+  # #Phi intercept
+  results[11,] <- 1 / exp(modelBoot$phi$offset +
+                        as.vector(coef(modelBoot)$phi$intercept))
+  #B0 on link scale (logit)
+  results[10,] <- modelBoot$mu$offset + as.vector(coef(modelBoot)$mu$intercept)
+  # yr.sd
+  results[7,] <- sd(as.vector(coef(modelBoot, raw=TRUE)$mu$`brandom(year)`))
+  
+  # prediction on test dataset using bootstrapped model
+  testset <- testdata
+  oldw <- getOption("warn")
+  options(warn=-1)
+  temp <- predict(modelBoot, newdata=testset, type="response")
+  options(warn=oldw)
+  testset$predy <- temp$mu
+  testset$predphi <- 1 / temp$phi
+  temp2 <- as.vector(coef(modelBoot, raw=TRUE)$mu$`brandom(year)`)
+  testset$mumarg <- inverselogit(logit(testset$predy)-temp2[testset$year])
+  testset$yhat <- rbeta(nrow(testset), testset$mumarg / testset$predphi,
+                        (1 - testset$mumarg) / testset$predphi)
+  results[100:199,] <- testset$mumarg
+  results[200:299,] <- testset$predphi
+  return(results)
+}
+
+####
+####----9.1. Bootstrapping with out-of-sample prediction--------------------####
+####                                                                        
+
+run_bootstrap_oos <- function(nboot, mstopmu, mstopphi, maindata, mainfmu, 
+  mainfphi, testdata, i, nyear) {
+  inverselogit <- function(alpha) {
+    exp(alpha) / ( exp(alpha) + 1 )
+  }
+  logit <- function(p) {
+    log(p/(1-p))
+  }
+  results <- matrix(data=NA, nrow=300, ncol=1)
+  thiscondition <- FALSE
+  while(thiscondition == FALSE){
+    bootingfunc <- function() {
+      tryCatch(
+        expr={
+          thiscond <- TRUE
+          while (thiscond) {
+            keepindex <- sample(1:i, i, replace=TRUE) # get index of obs to keep
+            bootdat <- maindata[keepindex, ] # get dataset
+            thiscond <- length(unique(bootdat$year)) != nyear
+          }
+          results <- FDboostLSS(list(mu=mainfmu, phi=y ~ 1), 
+            timeformula=NULL, 
+            families=BetaLSS(), 
+            data=bootdat, 
+            control=boost_control(mstop=c(mu=mstopmu, 
+              phi=mstopphi), 
+              nu=0.1))
+          return(list(results,keepindex))
+        }, error=function(error_message){
+          out <- error_message
+          return(out)
+        }
+      )
+    }
+    modelBoot <- bootingfunc()
+    if(any(class(modelBoot) == "error")) {
+      temp <- paste(as.character(modelBoot))
+      temp <- str_sub(word(temp,-1),end=-2)
+      mstopmu <- as.numeric(temp) - 1
+    } else {
+      keepindex <- modelBoot[[2]]
+      modelBoot <- modelBoot[[1]]
+      thiscondition <- TRUE
+    }
+  }
+  
+  # Keep mstop values
+  results[20,] <- mstop(modelBoot)[[1]]
+  results[21,] <- mstop(modelBoot)[[2]]
+  
+  #  #B1 on link scale (logit)
+  results[1,] <- coef(modelBoot, raw=TRUE, which=2, parameter="mu")[[1]]
+  #  #B2 on link scale (logit)
+  results[2,] <- coef(modelBoot, raw=TRUE, which=3, parameter="mu")[[1]]
+  #  #B3 on link scale (logit)
+  results[3,] <- coef(modelBoot, raw=TRUE, which=4, parameter="mu")[[1]]
+  #  #B4 on link scale (logit)
+  # results[4,] <- coef(modelBoot, raw=TRUE, which=5, parameter="mu")[[1]]
+  #  #B5 on link scale (logit)
+  # x <- coef(modelBoot)$mu$smterms$`bbs(effort)`$x
+  # y <- coef(modelBoot)$mu$smterms$`bbs(effort)`$value[,1]
+  # thisdat <- data.frame("x"=x, "y"=y)
+  # thismod <- lm(y ~ I(x^2), data=thisdat)
+  # results[5,] <- as.numeric(thismod$coefficients[2])
+  #  #B6 on link scale (logit)
+  results[6,] <- coef(modelBoot, raw=TRUE, which=5, parameter="mu")[[1]]
+  # #Phi intercept
+  results[11,] <- 1 / exp(modelBoot$phi$offset +
+      as.vector(coef(modelBoot)$phi$intercept))
+  #B0 on link scale (logit)
+  results[10,] <- modelBoot$mu$offset + as.vector(coef(modelBoot)$mu$intercept)
+  # yr.sd
+  results[7,] <- sd(as.vector(coef(modelBoot, raw=TRUE)$mu$`brandom(year)`))
+  
+  # prediction on test dataset using bootstrapped model
+  dontkeep <- unique(keepindex)
+  testset <- maindata
+  maindata$year <- as.numeric(as.factor(maindata$year))
+  oldw <- getOption("warn")
+  options(warn=-1)
+  temp <- predict(modelBoot, type="response")
+  options(warn=oldw)
+  maindata$predy <- temp$mu
+  maindata$predphi <- 1 / temp$phi
+  temp2 <- as.vector(coef(modelBoot, raw=TRUE)$mu$`brandom(year)`)
+  for(m in 1:nrow(maindata)){
+    maindata$raneff[m] <- temp2[maindata$year[m]]
+    maindata$predy2[m] <- inverselogit(logit(maindata$predy[m]) - maindata$raneff[m])
+    maindata$yhat[m] <-rbeta(1, maindata$predy2[m] / maindata$predphi,
+      (1 - maindata$predy2[m]) / maindata$predphi)
+  }
+  
+  maindata$yhat[dontkeep] <- NA
+  st <- nrow(maindata) + 100 - 1
+  results[100:st,] <- maindata$yhat
+  return(results)
+}
+
+####
+####----9.5 Function to parallelize Bootstrapping---------------------------####
+####                                                                        
+doboot <- function(numcores, pop, fun, ...){
+  require(doParallel)
+  require(foreach)
+  require(gamboostLSS) 
+  require(FDboost)
+  require(stringr)
+  cl <- makeCluster(numcores)
+  registerDoParallel(cl)
+  # clusterExport(cl, varlist=c("FUN", "a", "b"))
+  results <- foreach(pop, .combine=cbind,
+                     .packages=c("gamboostLSS", 
+                                   "FDboost", 
+                                   "stringr")) %dopar% fun(...)
+  stopCluster(cl)
+  return(results)
+}
+
+####
+####----10. Inverse-logit function------------------------------------------####
+
+inverselogit <- function(alpha) {
+  exp(alpha) / ( exp(alpha) + 1 )
+}
+
+####
+####----11. logit function------------------------------------------####
+
+logit <- function(p) {
+  log(p/(1-p))
+}
+
+
+####
+####----12. Parallelize cvrisk() function-----------------------------------####
+
+myApply <- function(dataset, FDboostLSS, ...){
+  myFun <- function(...) {
+    library("mboost")
+    FDboostLSS(...)
+  }
+  parLapply(cl=cl, dataset, myFun, ...)
+}
+
+
+####
+####----13. Customized predictor function (source: iml package at ----------####
+####        https://github.com/giuseppec/iml/blob/main/R/Predictor.R
+
+# This adds "regression" as the task for the iml:Predictor function. Otherwise
+# the task is "unknown" and it throws an error.
+
+inferTaskFromModel.ranger <- function(model) {
+  return("regression")
+}
+
+
+####
+####----14. R2G function-----------------------------------####
+
+r2g <- function(nullLL,fullLL,n) {
+  numer <- 1-(nullLL/fullLL)^(2/n)
+  denom <- 1-nullLL^(2/n)
+  out <- numer/denom
+  return(out)
+}
+
+####
+####----15. R2D function-----------------------------------####
+
+r2d <- function(nullLL,fullLL) {
+  numer <- -2*fullLL
+  denom <- -2*nullLL
+  out <- 1-numer/denom
+  out <- ifelse(is.infinite(out), NA, out)
+  return(out)
+}
+
+####
+####----16. MCMC prediction-----------------------------------####
+
+p.margin.mcmc <- function(x, testset) {
+  tempmu <- inverselogit(x["b_p0"] + 
+                           x["Beta[1]"]*testset$discharge + 
+                           x["Beta[2]"]*testset$temperature + 
+                           x["Beta[3]"]*testset$effort + 
+                           x["Beta[4]"]*
+                           as.numeric(paste(testset$precip)) + 
+                           x["Beta[5]"]*testset$effort^2 + 
+                           x["Beta[6]"]*testset$discharge*
+                           testset$temperature)
+  tempphi <- x["phi"]
+  out <- rbeta(nrow(testset), shape1=tempmu / tempphi, 
+               shape2=(1 - tempmu) / tempphi)
+  return(out)
+}
-- 
GitLab