FH_models for ERL

c75256da · Neha Hunka · 55511d82 · c75256da · c75256da
Commit c75256da authored 1 year ago by Neha Hunka
--- a/country_summaries/.ipynb_checkpoints/FH_ERL_paper-checkpoint.ipynb
+++ b/country_summaries/.ipynb_checkpoints/FH_ERL_paper-checkpoint.ipynb
@@ -342,6 +342,7 @@
    "rownames(Table) <- 1:nrow(Table)\n",
    "colnames(Table) <- c('y ~ x:','Coefficient','SE','t-value','p-value') # p-value > 0.05 indicates no significant systematic deviation \n",
    "Table['y ~ x:'] <- c('Intercept','Slope')\n",
+    "Table <- Table[head(seq_len(ncol(Table)), -2)]\n",
    "Table\n",
    "\n",
    "### Check results against know true values ###\n",
@@ -535,6 +536,7 @@
    "rownames(Table) <- 1:nrow(Table)\n",
    "colnames(Table) <- c('MAP - NFI ~ null:','Coefficient','SE','t-value','p-value') # p-value > 0.05 indicates no significant systematic deviation \n",
    "Table['y ~ x:'] <- c('Intercept')\n",
+    "Table <- Table[head(seq_len(ncol(Table)), -2)]\n",
    "Table\n",
    "\n",
    "### FH model relating difference in MAP and NFI mean AGBD estimates to terrain slope\n",
@@ -552,16 +554,9 @@
    "Table[nrow(Table) + 1,1] = \"Fay-Herriot R-squared = \" # R2 indicates the proportion of variation in MAP minus NFI estimated AGBD explained by terrain-slope  \n",
    "Table[3,2] = round(R2,digits=2)\n",
    "Table[3,3:5] = ''\n",
+    "Table <- Table[head(seq_len(ncol(Table)), -2)]\n",
    "Table"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b7fd9bb4-cd57-4899-8dd8-5bb6958ba51e",
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {

 %% Cell type:code id:7e1b7e4a-e6f5-47df-a012-1d0502e99bac tags:

 ``` R
 ###############################################################################################################
 ##################### FAY-HERRIOT MODELS ######################################################################
 ###############################################################################################################

 #### The following provides the code associated with artcle (please cite):

 # @article{10.1088/1748-9326/ad0b60,
 # 	author={Hunka, Neha and Santoro, Maurizio and Armston, John and Dubayah, Ralph and McRoberts,
 #           Ronald and Næsset, Erik and Quegan, Shaun and Urbazaev, Mikhail and Pascual, Adrián and May,
 #           Paul B and Minor, David and Leitold, Veronika and Basak, Paromita and Liang, Mengyu and Melo,
 #           Joana and Herold, Martin and Malága, Natalia and Wilson, Sylvia and Montesinos, Patricia Durán
 #           and Arana, Alexs and De La Cruz Paiva, Ricardo Ernesto and Ferrand, Jeremy and Keoka,
 #           Somphavy and Guerra-Hernandez, Juan and Duncanson, Laura},
 # 	title={On the NASA MAP and ESA CCI biomass maps: Aligning for uptake in the UNFCCC global stocktake},
 # 	journal={Environmental Research Letters},
 # 	url={http://iopscience.iop.org/article/10.1088/1748-9326/ad0b60},
 # 	year={2023}
 # }

 #### Please contact authors Paul May (paul.may@sdsmt.edu) and Neha Hunka (nhunka@umd.edu) for further information

 ##################################################################################################################
 ##################################################################################################################
 ##################################################################################################################
 ```

 %% Cell type:code id:8c2b15bd-ae47-4ee6-b183-931860fe969c tags:

 ``` R
 ####################################################################
 # Define functions
 ####################################################################

 round_df = function(df, digits) {
  nums - vapply(df, is.numeric, FUN.VALUE = logical(1))
  df[,nums] - round(df[,nums], digits = digits)
  (df)
 }

 # Function to compute the maximum likelihood estimates (MLEs) for a Fay-Herriot model.
 # The function asks for
 #     x = covariates,
 #     y = AGB map estimates,
 #     vardir = sampling variance of map estimates
 # If the variance associated with x (i.e. NFI estimates) is available, it may be injested too, see example 2 below.

 FHfit = function(y, x, vardir){
  n = length(y) # sample size
  x1 = cbind(rep(1,n), x) # Appends a column of 1's to the covariate matrix to allow an intercept
  p = ncol(x1) # total number of regression coefficients (including the intercept)

  # Below is the negative log-likelihood, dependent on log(\sigma^2),
  # where \sigma^2 is the variance of the regression error
  #   i.e. the error that would remain even with 0 sampling error.
  # The MLEs for regression coefficients are available in closed form
  # GIVEN a fixed \sigma_2, so the likelihood only depends on \sigma_2.
  # The log transformation is the release the strictly positive constraints
  # on \sigma^2. This eases the numerical optimization with respect to \sigma^2.

  nll = function(log.s2, y, x1, vardir){
    s2 = exp(log.s2) # backtransform to yield \sigma_2
    S = s2 + vardir # Compute the diagonal of $\Sigma$, the covariance matrix for the total error (sampling + regression error).
    x1s = x1/S # Compute \Sigma^{-1}x1
    ys = y/S # Compute \Sigma^{-1}y
    Coefficient.hat = solve(crossprod(x1s, x1), crossprod(x1s, y)) #Use the previous two quantities to compute the GLS estimates for the regression coeffecients
    res = y - x1%*%Coefficient.hat # The GLS residuals
    ress = ys - x1s%*%Coefficient.hat # \Sigma^{-1} times the GLS residuals
    obj = sum(log(S)) + crossprod(res, ress)[1,1] #negative log likelihood = log-determinant of the error covariance matrix + dot-product of standardized residuals
    return(obj) # return the negative log-likelihood
  }

  # The following line computes the MLE for \sigma^2 by minimizing the negative log-likelihood.
  #   (minimizing negative is the same as maximizing the positive)
  # A lower bound of \sigma^2 = 0.001 and an
  #   upper bound of \sigma^2 = the sample variance of y
  # is placed to avoid potential numerical issues in the optimization.
  # The upper bound is almost certainly always reasonable
  #       (the regression variance should be less than the total variance)
  # but depending on the units, the lower bound can be changed.
  # Note that if the units are Mg/ha biomass, this lower bound corresponds
  # to a error standard deviation of ~0.03 Mg/ha, implausibly low for most applications...
  # However, if your returned MLE for s2 (\sigma^2) is equal to the lower bound,
  # you should lower this lower bound

  fit = optimize(interval = c(log(0.001), # Lower bound for search
                              log(var(y))), # Upper bound for search
                 f = nll, # The function to be minimized
                 y = y, x1 = x1, vardir = vardir) # The variables nll() needs.

  s2 = exp(fit$minimum) # Undo the log transform to yield the MLE for \sigma^2
  S = s2 + vardir # Compute the diagonal of $\Sigma$, the covariance matrix for the total error (sampling + regression error).
  x1s = x1/S # Compute \Sigma^{-1}x1
  ys = y/S # Compute \Sigma^{-1}y
  Coefficient.hat = solve(crossprod(x1s, x1), crossprod(x1s, y)) #Use the previous two quantities to compute the GLS estimates for the regression coeffecients
  Coefficient.var = chol2inv(chol(crossprod(x1s, x1))) #covariance matrix of the reg. coefficients
  Coefficient.sd = sqrt(diag(Coefficient.var))
  p.vals = 2*pt(Coefficient.hat/Coefficient.sd, df = n-p, lower.tail = F) #p-values, computed using a t distribution
  return(list(est.coefs = Coefficient.hat, coef.sds = Coefficient.sd, p.vals = p.vals, ref.var = s2))
 }
 ```

 %% Cell type:code id:d6fb0c14-be93-4b3e-b851-c1a4b66bef86 tags:

 ``` R
 ###################################################################################
 # Example 1: Simulated data with variance only in the response variable (y)
 ###################################################################################

 ### Generate the data ###
 set.seed(123)
 n = 80 # Sample size for simulated data
 intercept.true = 0 # The true intercept term
 Coefficient.true = 2 # The true coefficient for x
 s2.true = 25 # The true \sigma^2, i.e. the regression variance, or random effects variance
 vardir = runif(n, 25, 100) # Random sampling varainces

 x = runif(n, 0, 20) # Random covariates
 y.true = intercept.true + x*Coefficient.true + rnorm(n, sd = sqrt(s2.true)) # Simulate y with no sampling variance

 ### Plot the true data and the data with sampling variance ###
 par(mfrow = c(2,2))
 plot(x, y.true, main = "No sampling error")
 y.est = y.true + rnorm(n, sd = sqrt(vardir)) # Add the sampling variance.
 plot(x, y.est, main = "With sampling error")

 ### Fit the model ###
 sim.fit = FHfit(y = y.est, x = x, vardir = vardir)
 Table = data.frame(round(sim.fit$est.coefs,digits=2), round(sim.fit$coef.sds,digits=2), round(sim.fit$est.coefs/sim.fit$coef.sds,digits=2), round(sim.fit$p.vals,digits=2))
 Table <- cbind( . = rownames(Table), Table)
 rownames(Table) <- 1:nrow(Table)
 colnames(Table) <- c('y ~ x:','Coefficient','SE','t-value','p-value') # p-value > 0.05 indicates no significant systematic deviation
 Table['y ~ x:'] <- c('Intercept','Slope')
+Table <- Table[head(seq_len(ncol(Table)), -2)]
 Table

 ### Check results against know true values ###
 # The estimated regression coefficients...
 sim.fit$est.coefs
 # ... compared to the actual values
 c(intercept.true, Coefficient.true)
 # They are well within the uncertainties of each other
 (sim.fit$est.coefs - c(intercept.true, Coefficient.true))/sim.fit$coef.sds
 # And the estimated \sigma^2 (i.e. regression variance or random effects variance)
 sim.fit$ref.var
 # ... compared to actual value
 s2.true
 # ... looking good!
 ```

 %% Output

    
    A data.frame: 2 × 5
    
    | <!--/--> | y ~ x: &lt;chr&gt; | Coefficient &lt;dbl&gt; | SE &lt;dbl&gt; | t-value &lt;dbl&gt; | p-value &lt;dbl&gt; |
    |---|---|---|---|---|---|
    | 1 | Intercept | 0.08 | 2.04 |  0.04 | 0.97 |
    | 2 | Slope     | 1.99 | 0.18 | 10.82 | 0.00 |
    
    A data.frame: 2 × 5
    \begin{tabular}{r|lllll}
      & y \textasciitilde{} x: & Coefficient & SE & t-value & p-value\\
      & <chr> & <dbl> & <dbl> & <dbl> & <dbl>\\
    \hline
    	1 & Intercept & 0.08 & 2.04 &  0.04 & 0.97\\
    	2 & Slope     & 1.99 & 0.18 & 10.82 & 0.00\\
    \end{tabular}

    
    A matrix: 2 × 1 of type dbl
    
    | <!----> | 0.08441531 |
    | x | 1.98601957 |
    
    A matrix: 2 × 1 of type dbl
    \begin{tabular}{r|l}
    	 & 0.08441531\\
    	x & 1.98601957\\
    \end{tabular}

    1. 0
    2. 2
    
    
    \begin{enumerate*}
    \item 0
    \item 2
    \end{enumerate*}

    
    A matrix: 2 × 1 of type dbl
    
    | <!----> |  0.04130370 |
    | x | -0.07615911 |
    
    A matrix: 2 × 1 of type dbl
    \begin{tabular}{r|l}
    	 &  0.04130370\\
    	x & -0.07615911\\
    \end{tabular}

    25.0639250909692
    25.0639250909692

    25
    25



 %% Cell type:code id:1b3ea6fc-a7b7-4dcf-8be8-af4f0dc87df0 tags:

 ``` R
 ################################################################################
 # Example 2: Simulated NFI means and AGB map means, both with estimated variance
 ################################################################################
 data <- read.csv('/projects/my-private-bucket/Data/PAPER1_Private/NFI_MAP_AGBD_mock.csv')
 data[1:5,]

 NFI = data$NFI_AGBD_mean_estimate # Mean AGBD estimate from National Forest Inventory
 NFI.var = data$NFI_AGBD_var_estimate # Variance of AGBD estimate from National Forest Inventory
 MAP = data$AGBD_Map_mean_estimate # Mean AGBD estimate from EO-based map
 MAP.var = data$AGBD_Map_var_estimate # Variance of AGBD estimate from EO-based map
 slope = data$Slope_degrees # Estimate of topographic slope in degrees

 ### FH model relating MAP and NFI mean AGBD estimates
 mod1 =FHfit(y = MAP-NFI, x = NULL, vardir = NFI.var + MAP.var)
 Table = data.frame(round(mod1$est.coefs,digits=2), round(mod1$coef.sds,digits=2), round(mod1$est.coefs/mod1$coef.sds,digits=2), round(mod1$p.vals,digits=2))
 Table <- cbind( . = rownames(Table), Table)
 rownames(Table) <- 1:nrow(Table)
 colnames(Table) <- c('MAP - NFI ~ null:','Coefficient','SE','t-value','p-value') # p-value > 0.05 indicates no significant systematic deviation
 Table['y ~ x:'] <- c('Intercept')
+Table <- Table[head(seq_len(ncol(Table)), -2)]
 Table

 ### FH model relating difference in MAP and NFI mean AGBD estimates to terrain slope
 diff = MAP - NFI
 intercept = rep(1, length(MAP))
 my.mod = FHfit(y = diff, x = slope, vardir = NFI.var + MAP.var)
 null.mod = FHfit(y = diff, x = NULL, vardir = NFI.var + MAP.var)

 R2 = 1 - my.mod$ref.var/null.mod$ref.var
 Table = data.frame(round(my.mod$est.coefs,digits=2), round(my.mod$coef.sds,digits=2), round(my.mod$est.coefs/my.mod$coef.sds,digits=2), round(my.mod$p.vals,digits=2))
 Table <- cbind( . = rownames(Table), Table)
 rownames(Table) <- 1:nrow(Table)
 colnames(Table) <- c('MAP - NFI ~ Slope:','Coefficient','SE','t-value','p-value') # p-value < 0.05 on slope shows significant influence of terrain-slope
 Table['MAP - NFI ~ Slope:'] <- c('Intercept','Slope')
 Table[nrow(Table) + 1,1] = "Fay-Herriot R-squared = " # R2 indicates the proportion of variation in MAP minus NFI estimated AGBD explained by terrain-slope
 Table[3,2] = round(R2,digits=2)
 Table[3,3:5] = ''
+Table <- Table[head(seq_len(ncol(Table)), -2)]
 Table
 ```

 %% Output

    
    A data.frame: 5 × 6
    
    | <!--/--> | STRATUM_ID &lt;int&gt; | NFI_AGBD_mean_estimate &lt;dbl&gt; | NFI_AGBD_var_estimate &lt;int&gt; | AGBD_Map_mean_estimate &lt;dbl&gt; | AGBD_Map_var_estimate &lt;int&gt; | Slope_degrees &lt;int&gt; |
    |---|---|---|---|---|---|---|
    | 1 | 0 | 88.33389 | 8390 | 224.84103 | 146 | 28 |
    | 2 | 1 | 76.35538 |  405 |  83.41033 |  21 | 17 |
    | 3 | 2 | 75.22345 |  381 | 120.67073 |  25 | 23 |
    | 4 | 3 | 61.54510 |  481 | 120.49180 |  41 | 24 |
    | 5 | 4 | 53.74778 |  341 |  23.59085 |  10 |  1 |
    
    A data.frame: 5 × 6
    \begin{tabular}{r|llllll}
      & STRATUM\_ID & NFI\_AGBD\_mean\_estimate & NFI\_AGBD\_var\_estimate & AGBD\_Map\_mean\_estimate & AGBD\_Map\_var\_estimate & Slope\_degrees\\
      & <int> & <dbl> & <int> & <dbl> & <int> & <int>\\
    \hline
    	1 & 0 & 88.33389 & 8390 & 224.84103 & 146 & 28\\
    	2 & 1 & 76.35538 &  405 &  83.41033 &  21 & 17\\
    	3 & 2 & 75.22345 &  381 & 120.67073 &  25 & 23\\
    	4 & 3 & 61.54510 &  481 & 120.49180 &  41 & 24\\
    	5 & 4 & 53.74778 &  341 &  23.59085 &  10 &  1\\
    \end{tabular}

    
    A data.frame: 1 × 6
    
    | <!--/--> | MAP - NFI ~ null: &lt;chr&gt; | Coefficient &lt;dbl&gt; | SE &lt;dbl&gt; | t-value &lt;dbl&gt; | p-value &lt;dbl&gt; | y ~ x: &lt;chr&gt; |
    |---|---|---|---|---|---|---|
    | 1 | 1 | -7.28 | 3.17 | -2.3 | 1.98 | Intercept |
    
    A data.frame: 1 × 6
    \begin{tabular}{r|llllll}
      & MAP - NFI \textasciitilde{} null: & Coefficient & SE & t-value & p-value & y \textasciitilde{} x:\\
      & <chr> & <dbl> & <dbl> & <dbl> & <dbl> & <chr>\\
    \hline
    	1 & 1 & -7.28 & 3.17 & -2.3 & 1.98 & Intercept\\
    \end{tabular}

    
    A data.frame: 3 × 5
    
    | <!--/--> | MAP - NFI ~ Slope: &lt;chr&gt; | Coefficient &lt;dbl&gt; | SE &lt;chr&gt; | t-value &lt;chr&gt; | p-value &lt;chr&gt; |
    |---|---|---|---|---|---|
    | 1 | Intercept                | -55.91 | 7.45 | -7.5 | 2 |
    | 2 | Slope                    |   3.02 | 0.43 | 6.96 | 0 |
    | 3 | Fay-Herriot R-squared =  |   0.62 | <!----> | <!----> | <!----> |
    
    A data.frame: 3 × 5
    \begin{tabular}{r|lllll}
      & MAP - NFI \textasciitilde{} Slope: & Coefficient & SE & t-value & p-value\\
      & <chr> & <dbl> & <chr> & <chr> & <chr>\\
    \hline
    	1 & Intercept                & -55.91 & 7.45 & -7.5 & 2\\
    	2 & Slope                    &   3.02 & 0.43 & 6.96 & 0\\
    	3 & Fay-Herriot R-squared =  &   0.62 &      &      &  \\
    \end{tabular}
-
-%% Cell type:code id:b7fd9bb4-cd57-4899-8dd8-5bb6958ba51e tags:
-
-``` R
-```

--- a/country_summaries/FH_ERL_paper.ipynb
+++ b/country_summaries/FH_ERL_paper.ipynb