From 1d0311ffef31aeeded82ff50fb7ca39e13be6ed2 Mon Sep 17 00:00:00 2001
From: nehajo88 <nehajo88@gmail.com>
Date: Wed, 6 Mar 2024 20:46:53 -0800
Subject: [PATCH] MOZ

---
 WUR_MOZ_2024/1. MOZ_data_preparation.Rmd      | 194 ++++++++++
 .../2. MOZ_data_preparation_ccibiomass.Rmd    | 129 +++++++
 WUR_MOZ_2024/3. calculating polygon means.Rmd | 210 ++++++++++
 WUR_MOZ_2024/4. Model-assisted estimation.Rmd | 292 ++++++++++++++
 WUR_MOZ_2024/Mozambique_MA.ipynb              | 361 ++++++++++++++++++
 5 files changed, 1186 insertions(+)
 create mode 100644 WUR_MOZ_2024/1. MOZ_data_preparation.Rmd
 create mode 100644 WUR_MOZ_2024/2. MOZ_data_preparation_ccibiomass.Rmd
 create mode 100644 WUR_MOZ_2024/3. calculating polygon means.Rmd
 create mode 100644 WUR_MOZ_2024/4. Model-assisted estimation.Rmd
 create mode 100644 WUR_MOZ_2024/Mozambique_MA.ipynb

diff --git a/WUR_MOZ_2024/1. MOZ_data_preparation.Rmd b/WUR_MOZ_2024/1. MOZ_data_preparation.Rmd
new file mode 100644
index 0000000..b8183c5
--- /dev/null
+++ b/WUR_MOZ_2024/1. MOZ_data_preparation.Rmd	
@@ -0,0 +1,194 @@
+---
+title: "Downloading and pre-processing Mozambique's NFI data"
+output: html_notebook
+---
+
+This code and text has been adapted from Malaga et al. (under review), "Global biomass maps can increase the precision of 1 (sub)national aboveground biomass estimates: a 2 comparison across tropical countries". This study has been carried out in collaboration with the MRV Unit, National Sustainable Development Fund (FNDS) in Mozambique, among many other collaborators. In order to use Mozambique's NFI data, careful attention must be taken to the NFI's sampling design, forest strata, among many other factors. The objective of this notebook is to show the steps followed to prepare Mozambique's NFI dataset for its future integration with CCI Biomass data. 
+
+The Mozambique NFI followed a restricted stratified random sampling approach, with the stratification based on an [agroecological map](https://www.fnds.gov.mz/mrv/index.php/documentos/relatorios/26-inventario-florestal-nacional/fileMaputo). The forest strata are:
+
+* Mopane forests
+* Mercrusse forests
+* Semi-decidouous forests
+* Semi-evergreen forests
+
+Plot locations were initally randomly selected but later decided to move to random locations to grid intersections. Upon visual inspection, some clusters of the original design were further re-located, because of discrepancies between the map forest strata and the ground conditions. Each cluster connsists of four 50m-by-20m (0,1 ha) plots at 50 m spacing configured in a square, each divided in four subplots of 25m-by-10m. All trees with DBH greater than 10 cm were measured within a plot and trees with DBH greater than 5 cm and lower than 10 cm were measured within the first subplot.
+
+
+```{r include=FALSE}
+setwd("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks")
+```
+
+```{r echo=FALSE}
+knitr::include_graphics("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/Mozambique NFI.png")
+```
+
+The pre-processing steps followed here entail filtering out clusters with missing or duplicated coordinates. In addition, clusters that did not fall within the four forest strata were disregarded. Clusters crossing stratum boundaries were assigned to the majority clusters. 
+
+# Loading packages
+```{r, echo=TRUE}
+library(openxlsx)
+library(sf)
+library(dplyr)
+library(geosphere)
+library(lmtest)
+library(ggplot2)
+library(lmtest)
+library(terra)
+library(sp)
+```
+
+## Loading data
+
+Load the NFI database
+```{r}
+NFI_data=openxlsx::read.xlsx("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Data_IFN_plt.xlsx")
+```
+
+Load the Forest stratification layers
+```{r}
+Moz_strata_geo<-st_read("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Strata_reclassify_geogra.shp")  #WGS_84
+```
+# 1. Filtering out NA coordinates within the NFI plots
+
+The original dataset includes 3272 rows with information (3272 plots)
+There are 3272 rows with coordinates and 148 with no coordinates
+```{r}
+table(is.na(NFI_data$Ycoordinate))
+```
+Remove the raws with NA coordinates and save new DataFrame as `NFI_data_c`
+```{r}
+NFI_data_c<-NFI_data[!is.na(NFI_data$Xcoordinate),] #remove NA coordinates
+NFI_data_c<-NFI_data[!is.na(NFI_data$Ycoordinate),] #remove NA coordinates
+```
+
+# 2. Filtering out duplicated clusters  
+To identify plot with duplicated coordinates, we will transform `NFI_data_c` to a Large SpatialPointsDataFrame
+```{r}
+coordinates(NFI_data_c) =~ Xcoordinate + Ycoordinate 
+```
+
+Then, we will define the projection: WGS84
+```{r}
+crs_wgs84 <- CRS(SRS_string = "EPSG:4326") # WGS 84 has EPSG code 4326
+slot(NFI_data_c, "proj4string") <- crs_wgs84
+```
+
+Create `TestFram` to identify duplicated coordinated, which are a total of 8 plots with duplicated coordinates
+```{r}
+TestFram=paste((NFI_data_c@coords[,1]),as.numeric(NFI_data_c@coords[,2]))
+TestFrameCounts<-as.data.frame(table(TestFram))
+TestFrameCounts[TestFrameCounts$Freq==max(TestFrameCounts$Freq),] #identify plots with duplicated 
+```
+
+The coordinates in `TestFram` will be used to identify the plots/clusters that need to be removed
+```{r}
+NFI_data_c$id_parcela[NFI_data_c@coords[,1]=="33.4935005878"]
+NFI_data_c$id_parcela[NFI_data_c@coords[,1]=="33.4935032236"]
+NFI_data_c$id_parcela[NFI_data_c@coords[,1]=="33.4944496012999"]
+NFI_data_c$id_parcela[NFI_data_c@coords[,1]=="33.4944522421"]
+NFI_data_c$id_parcela[NFI_data_c@coords[,1]=="33.5660090220999"]
+NFI_data_c$id_parcela[NFI_data_c@coords[,1]=="33.566011865"]
+NFI_data_c$id_parcela[NFI_data_c@coords[,1]=="33.5669523366999"]
+NFI_data_c$id_parcela[NFI_data_c@coords[,1]=="33.5669551842999"]
+```
+The 8 duplicated plots belong to the following clusters: 
+
+* 216 & 218
+* 265 & 269
+
+We will remove clusters 218 and 269 and save the new dataframe as `NFI_Data_c2`
+
+```{r}
+NFI_data_c2<-NFI_data_c[!(NFI_data_c$Cluster%in%c(218,269)),]
+nrow(NFI_data_c)-nrow(NFI_data_c2)
+```
+
+# 3. Incorporating the Stratification information to the NFI database
+
+The population of interest was determined by the original stratification layer used in the sampling design: the 2008 re-classified agro-ecological map (MITADER, 2018). The forest strata (classes) were ruled by the stratification layer. The forest strata are: Mopane (Mo), Mecrusse (Me), Semi-deciduous forest (SDF) and Semi-evergreen forests (SEF). 
+
+First, convert to sf data frame for st_join to work
+```{r}
+NFI_data_strata = st_as_sf(x=NFI_data_c2, 
+                    coords = c("Xcoordinate", "Ycoordinate"),
+                    crs = crs(Moz_strata_geo)) #Geographic   
+```
+ 
+                    
+Check that projections are the same
+```{r}
+st_crs(Moz_strata_geo)==st_crs(NFI_data_strata)
+```
+Examine the stratification layer
+```{r}
+summary(Moz_strata_geo)
+```
+We're interested in the Forest Strata, therefore we will extract `FOREST_STR` from `Moz_strata_geo` and save as `NFI_data_strata`
+```{r}
+Moz_strata_layer<-Moz_strata_geo[,"FOREST_STR"]
+sf::sf_use_s2(FALSE) # needed when using geographic coord but not UTM
+NFI_data_strata=st_join(NFI_data_strata, Moz_strata_layer) # join/extract values of shapefile to spatial points
+```
+Remove funny duplicated rows that appear from the previous procedure: 
+
+```{r}
+NFI_data_strata[duplicated(NFI_data_strata),]
+NFI_data_strata = NFI_data_strata[!duplicated(NFI_data_strata),]
+nrow(NFI_data_strata)
+as.data.frame(NFI_data_strata)
+
+```
+
+Convert it back to a dataframe that includes the stratification info per plot
+
+```{r}
+NFI_data_df=NFI_data_strata %>% st_drop_geometry()
+NFI_data_df = as.data.frame(NFI_data_df)
+NFI_data_df$x=st_coordinates(NFI_data_strata)[,1]#re-assign the coord X
+NFI_data_df$y=st_coordinates(NFI_data_strata)[,2]#re-assign the coord Y
+```
+
+
+# 4. Re-define strata so all clusters fall within the same strata.
+
+"Clusters crossing stratum boundaries were assigned to the majority stratum" 
+
+Ruling: if there are NA plots within a cluster, assign the class corresponding to the first plot within the cluster. If the first plot is class NA, than use the next plot with a valid class... If all plots within a cluster are NA, than maintain the NA for the whole cluster
+
+Create `FOREST_STR_NEW`
+```{r}
+NFI_data_df$FOREST_STR_NEW=NA
+
+NFI_data_df[NFI_data_df$FOREST_STR == "NA",]$FOREST_STR=NA #The function below needs for all "NA"s to be turned into real NAs
+```
+
+Re-assign the forest strata to the original data set as explained above
+```{r}
+for(i in unique(NFI_data_df$Cluster_new)){
+  
+  selected = which(NFI_data_df$Cluster_new == i)
+  
+  NFI_data_df[selected,]$FOREST_STR_NEW = ifelse(is.null(names(which.max(table(c(NFI_data_df[selected,]$FOREST_STR))))),NA,NFI_data_df[selected,]$FOREST_STR[!is.na(NFI_data_df[selected,]$FOREST_STR)][1])
+}
+```
+
+
+Additional step: remove clusters with Strata=NA
+
+"Clusters that did not fall within the four forest strata as defined by the re-classified Agro-ecological map (our area of interest) were disregarded, assuming they had been moved and departed from the original design."
+
+```{r}
+NFI_data_df[(is.na(NFI_data_df$FOREST_STR_NEW)),c(4,6,12,15)] # daniela: change columns if these are not the ones of interest
+```
+
+Remove rows with NA for strata
+```{r}
+NFI_data_n=NFI_data_df[!is.na(NFI_data_df$FOREST_STR_NEW),]
+length(NFI_data_df$FOREST_STR_NEW)-length(NFI_data_n$FOREST_STR_NEW)
+```
+Save the cleaned database `NFI_data_n`
+
+```{r}
+openxlsx::write.xlsx(NFI_data_n, ("Data_INF_plt_clean_strata_noNA2.xlsx"))
+```
diff --git a/WUR_MOZ_2024/2. MOZ_data_preparation_ccibiomass.Rmd b/WUR_MOZ_2024/2. MOZ_data_preparation_ccibiomass.Rmd
new file mode 100644
index 0000000..89d82f6
--- /dev/null
+++ b/WUR_MOZ_2024/2. MOZ_data_preparation_ccibiomass.Rmd	
@@ -0,0 +1,129 @@
+---
+title: "2. CCI biomass Pre-processing steps"
+output: html_notebook
+---
+
+This code and text has been adapted from Malaga et al. (under review), "Global biomass maps can increase the precision of 1 (sub)national aboveground biomass estimates: a 2 comparison across tropical countries". In the previous notebook ("1. Downloading and pre-processing Mozambique's NFI data"), we have explained the pre-processing steps needed to adapt Mozambique's NFI for its integration with CCI Biomass data. 
+
+In this notebook, we will show how we adapted the European Space Angency Climate Change Initiative (CCI) version 4 [global biomass maps](https://catalogue.ceda.ac.uk/uuid/af60720c1e404a9e9d2c145d2b2ead4e) for its subsequent integration with Mozambique's NFI. To minimize discrepancies product of temporal mismatches, the CCI Biomass epoch closest to Mozambique's NFI implementation period was chosen, which was CCI Biomass year 2017 (v.4). 
+
+#Loading packages
+```{r, echo=TRUE}
+library(sp)
+library(sf)
+library(terra)
+library(raster)
+```
+# 0. Download CCI Biomass (2017 v4) for Mozambique 
+Tiles of CCI Biomass year 2017 v4 can be found in the following link: [https://data.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v4.0/geotiff/2017](https://data.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v4.0/geotiff/2017). Save tiles into your working directory. 
+
+Load tiles from your working directory
+```{r include=FALSE}
+setwd("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/CCI 2017 v4") #set the your working directory
+
+tile1<-raster("S10E030_ESACCI-BIOMASS-L4-AGB-MERGED-100m-2017-fv4.0.tif")
+tile2<-raster("S10E040_ESACCI-BIOMASS-L4-AGB-MERGED-100m-2017-fv4.0.tif")
+tile3<-raster("S20E030_ESACCI-BIOMASS-L4-AGB-MERGED-100m-2017-fv4.0.tif")
+```
+# 1. Merge the original CCI map tiles into a Mosaic `CCI_2017` and save
+```{r}
+CCI_2017<-merge(tile1,tile2,tile3) #this process might take some time
+plot(CCI_2017)
+```
+
+# 2. Crop the CCI mosaic to the extent of the stratification layer (our defined population)
+
+Load the stratification layer and check its extent and coordinate reference system
+```{r}
+Moz_strata_geo<-st_read("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Strata_reclassify_geogra.shp") #WGS_84
+extent(Moz_strata_geo)
+plot(Moz_strata_geo["FOREST_STR"])
+```
+Crop `CCI_2017`, and then force again the extent in order for the raster to overlay 
+```{r}
+CCI_crop<-crop(CCI_2017, extent(Moz_strata_geo)) #crop the biomass map to the  extent of the study area
+extent(Moz_strata_geo)==extent(CCI_crop) #check that extents match
+bb <- extent(Moz_strata_geo)
+extent(CCI_crop) <- bb
+extent(Moz_strata_geo)==extent(CCI_crop) #check again if extents match now
+```
+# 3. Crop the CCI Biomass into Mozambique's 4 forest strata 
+
+As seen above, there are four types of Forest strata: Semi decidious forest, Semi evergreen forests, Mopane forests and Mercrusse forests. Let's mask `CCI crop` for each Forest strata
+
+* Mopane forests
+```{r}
+library(terra)
+mask_Mopane<-Moz_strata_geo[Moz_strata_geo$FOREST_STR=="Mopane",]
+CCI_Mopane <- terra::crop(CCI_crop, mask_Mopane, mask=TRUE) 
+CCI_Mopane <- as(CCI_Mopane, "SpatRaster") # read as SpatRaster
+mask_Mopane <- vect(mask_Mopane) #read as terra-friendly format (SpatVector)
+CCI_Mopane <- terra::mask(CCI_Mopane, mask_Mopane)
+plot(CCI_Mopane)
+```
+
+* Mercrusse forests
+```{r}
+mask_Mecrusse<-Moz_strata_geo[Moz_strata_geo$FOREST_STR=="Mecrusse",]
+CCI_Mecrusse <- terra::crop(CCI_crop, mask_Mecrusse, mask=TRUE) 
+CCI_Mecrusse <- as(CCI_Mecrusse, "SpatRaster") # read as SpatRaster
+mask_Mecrusse <- vect(mask_Mecrusse) #read as terra-friendly format (SpatVector)
+CCI_Mecrusse <- terra::mask(CCI_Mecrusse, mask_Mecrusse)
+plot(CCI_Mecrusse)
+```
+
+* Semi deciduous forests
+```{r}
+mask_SDF<-Moz_strata_geo[Moz_strata_geo$FOREST_STR=="Semi deciduous forest",]
+CCI_SDF <- terra::crop(CCI_crop, mask_SDF, mask=TRUE) 
+CCI_SDF <- as(CCI_SDF, "SpatRaster") # read as SpatRaster
+mask_SDF <- vect(mask_SDF) #read as terra-friendly format (SpatVector)
+CCI_SDF <- terra::mask(CCI_SDF, mask_SDF)
+plot(CCI_SDF)
+```
+
+* Semi evergreen forests
+```{r}
+mask_SEF<-Moz_strata_geo[Moz_strata_geo$FOREST_STR=="Semi evergreen forest",]
+CCI_SEF <- terra::crop(CCI_crop, mask_SEF, mask=TRUE) 
+CCI_SEF <- as(CCI_SEF, "SpatRaster") # read as SpatRaster
+mask_SEF <- vect(mask_SEF) #read as terra-friendly format (SpatVector)
+CCI_SEF <- terra::mask(CCI_SEF, mask_SEF)
+plot(CCI_SEF)
+```
+
+
+Don't forget to save ;)
+```{r}
+writeRaster(CCI_Mopane, "CCI_Mopane.tif", overwrite=TRUE)
+writeRaster(CCI_Mecrusse, "CCI_Mecrusse.tif", overwrite=TRUE)
+writeRaster(CCI_SDF, "CCI_SDF.tif", overwrite=TRUE)
+writeRaster(CCI_SEF, "CCI_SEF.tif", overwrite=TRUE)
+```
+
+# 4.  Aggregate the per-stratum maps to define the population
+
+A PSU consists of a square polygon whose dimensions are large enough to encompass all plots within a cluster. See figure of Mozambique's plot design below. To obtain PSU-equivalent map AGB estimates for the population (synthetic estimator, later seen in the estimator), CCI map units were aggregated to polygons of the same size as defined by the PSU. This implies aggregating the map by a factor of 2. 
+
+```{r include=FALSE}
+setwd("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks")
+```
+
+```{r echo=FALSE}
+knitr::include_graphics("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/Mozambique NFI.png")
+```
+
+Aggregate the map y a factor of 2
+```{r}
+CCI_Mo_ag<-aggregate(CCI_Mopane,fact=2,fun=mean,na.rm=TRUE) 
+CCI_Me_ag<-aggregate(CCI_Mecrusse,fact=2,fun=mean,na.rm=TRUE)
+CCI_SDF_ag<-aggregate(CCI_SDF,fact=2,fun=mean,na.rm=TRUE)
+CCI_SEF_ag<-aggregate(CCI_SEF,fact=2,fun=mean,na.rm=TRUE)
+```
+Don't forget to save aggregated rasters ;)
+```{r}
+writeRaster(CCI_Mo_ag, "CCI_Mopane_agg2.tif", overwrite=TRUE)
+writeRaster(CCI_Me_ag, "CCI_Mecrusse_agg2.tif", overwrite=TRUE)
+writeRaster(CCI_SDF_ag,"CCI_SDF_agg2.tif", overwrite=TRUE)
+writeRaster(CCI_SEF_ag,"CCI_SEF_agg2.tif", overwrite=TRUE)
+```
diff --git a/WUR_MOZ_2024/3. calculating polygon means.Rmd b/WUR_MOZ_2024/3. calculating polygon means.Rmd
new file mode 100644
index 0000000..2fc7acb
--- /dev/null
+++ b/WUR_MOZ_2024/3. calculating polygon means.Rmd	
@@ -0,0 +1,210 @@
+---
+title: "3. Extracting and computing AGB means over the remote sensing-based sampling units (PSUs)"
+output: html_notebook
+---
+
+This code and text has been adapted from Malaga et al. (under review), "Global biomass maps can increase the precision of 1 (sub)national aboveground biomass estimates: a 2 comparison across tropical countries". In the previous two notebooks ("1. Downloading and pre-processing Mozambique's NFI data" and "2. CCI biomass Pre-processing steps" ), we have explained the pre-processing steps needed to adapt Mozambique's NFI and the CCI Biomass (yr. 2017 v4) for their integration. 
+
+In order to compare NFI clusters with the adapted biomass map, we first need to compute AGB mean values over the remote sensing sampling unit (PSU). As a reminder, a PSU is defined as a square polygon whose dimensions are large enough to encompass all plots within a cluster. 
+
+## Load pacakges
+```{r}
+library(terra)
+library(geosphere)
+library(dplyr)
+library(sf)
+```
+
+A regression model associating map and field-based AGB values is fitted within each forest stratum. The dependent variable corresponds to the mean AGB over the plots within the i-th PSU and the independent variable, is the mean AGB over the CCI biomass map units within the i-th PSU. With this  script, we are processing the latter.
+
+## Extract CCI Map values for each  PSU polygon (sampling unit) (y-caret)
+
+# 1) Define functions 
+
+Define coordinate reference system
+```{r}
+SRS <- st_crs(4326)
+```
+
+The function `MakePolygon` makes the PSUs polygons, a polygon large enough to encompass all plots within a NFI cluster. The length (`L`) refers to the size of the edge and the width (`w`) refers to a [safe] buffer distance from a cluster's central coordinate to the SW corner of the polygon
+
+```{r}
+MakePolygon <- function(df_coord, L,w) { #width refers to the length of the square and w to the buffer width
+  N_CP<-destPoint(df_coord,225,sqrt(w^2+w^2)) #SW corner of the polygon
+  Vertix = as.data.frame(rbind(N_CP,destPoint(N_CP,0,L),destPoint(N_CP,45,sqrt(L^2+L^2)),destPoint(N_CP,90,L))) #four corners of the polygon
+  Vertix_sp <- st_as_sf(x = Vertix, #convert points into spatial data
+                          coords = c("lon", "lat"),
+                          crs = SRS)
+  my.polygon = Vertix_sp %>% # turns points into polygons
+    summarise(geometry = st_combine(geometry)) %>%
+    st_cast("POLYGON")
+    return(my.polygon)
+}
+```
+
+Define function `MapValues` to extract the CCI biomass map values within the created polygons (`pol`) (PSUs) using the weighted mean 
+```{r}
+Mapvalues <- function(pol, wghts=T){ 
+  MapV <- numeric()
+  ras <-CCIbi #CCI Biomass map raster
+  
+  vls <- matrix(ncol=2, nrow=0)
+  for(f in ras) 
+    if(file.exists(f))
+      
+    {vls <- rbind(vls, terra::extract((rast(f)), vect(pol), weights=TRUE #give the Map values with the fraction of the pixel it covers
+    ))
+    
+    MapV <- c(MapV, stats::weighted.mean(vls[,2],vls[,3],na.rm=TRUE))} #give a weighted mean between the CCI values and the fraction of the pixel, ignore all NA (non-forest)
+  MapV
+}
+
+```
+
+
+# 2. Extract map values per stratum
+
+*if not run before*
+```{r}
+NFI_data=openxlsx::read.xlsx("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Data_INF_plt_clean_strata_noNA2.xlsx")
+```
+
+* Semi Decidious Forests (SDF): PSU size = 198m x 198 m ~ 2x2 CCI biomass pixels.
+```{r}
+CCIbi<-"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/CCI_SDF.tif" #change for each ecozone
+file.exists("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/CCI_SDF.tif")
+```
+Subset plots in the corresponding forest strata, select coordinates
+```{r}
+Pol_subset_sdf<-NFI_data[(NFI_data$FOREST_STR_NEW=="Semi deciduous forest"&NFI_data$Plot==1),c("x","y")] 
+nrow(Pol_subset_sdf)
+```
+
+Using the previous functions, create the polygons (PSUs), extract the CCI biomass maps values for the pixels intersecting the PSUs and calculate mean weighted mean of those pixel values
+```{r}
+polygon_Tmp=c() #create a vector called polygon_Tmp
+Mapbiomass_SDF=rep(NA,nrow(Pol_subset_sdf)) #fill in the vector with repeated 0s for as many lines as the number of clusters per strata
+for(i in 1:nrow(Pol_subset_sdf)){
+  polygon_Tmp=MakePolygon(df_coord=Pol_subset_sdf[i,],198,40) # 198m is the length of the a side of the polygon (PSU), and 40m is the buffer distance given from the cluster's central coordinate to the SW corner of the polygon. 
+  Mapbiomass_SDF[i]=Mapvalues(pol=polygon_Tmp,wghts = T) [1]
+}
+```
+
+Create `Mapbiomass_SDF` to store PSUs map value means 
+```{r}
+str(Mapbiomass_SDF)
+```
+
+Now, repeat the procedure for the other 3 remaining strata
+
+* Semi Evergreen Forests (SEF): PSU size = 198m x 198 m
+```{r}
+CCIbi<-"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/CCI_SEF.tif" #change for each ecozone
+file.exists("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/CCI_SEF.tif")
+```
+Select the SW plot coordinate of every cluster
+```{r}
+Pol_subset_sef<-NFI_data[(NFI_data$FOREST_STR_NEW=="Semi evergreen forest"&NFI_data$Plot==1),c("x","y")]
+nrow(Pol_subset_sef)
+```
+Create polygons and calculate mean CCI Biomass values
+```{r}
+polygon_Tmp=c() 
+Mapbiomass_SEF=rep(NA,nrow(Pol_subset_sef)) 
+for(i in 1:nrow(Pol_subset_sef)){
+  polygon_Tmp=MakePolygon(df_coord=Pol_subset_sef[i,],198,40)
+  Mapbiomass_SEF[i]=Mapvalues(pol=polygon_Tmp,wghts = T) [1]
+}
+```
+
+Store those PSUs map value means as stored as `Mapbiomass_SEF`
+```{r}
+str(Mapbiomass_SEF)
+```
+
+
+* Mercrusse Forest (ME): PSU size = 198m x 198 m
+```{r}
+CCIbi<-"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/CCI_Mecrusse.tif" #change for each ecozone
+file.exists("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/CCI_Mecrusse.tif")
+```
+
+Select the SW plot coordinate of every cluster
+```{r}
+Pol_subset_Me<-NFI_data[(NFI_data$FOREST_STR_NEW=="Mecrusse"&NFI_data$Plot==1),c("x","y")] 
+nrow(Pol_subset_Me)
+```
+
+Create polygons and calculate mean CCI Biomass values
+```{r}
+polygon_Tmp=c() 
+Mapbiomass_ME=rep(NA,nrow(Pol_subset_Me)) 
+for(i in 1:nrow(Pol_subset_Me)){
+  polygon_Tmp=MakePolygon(df_coord=Pol_subset_Me[i,],198,40)
+  Mapbiomass_ME[i]=Mapvalues(pol=polygon_Tmp,wghts = T) [1]
+}
+```
+
+Store those PSUs map value means as stored as `Mapbiomass_ME`
+```{r}
+str(Mapbiomass_ME)
+```
+
+* Mopane Forests (MO): PSU size = 198m x 198 m
+```{r}
+CCIbi<-"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/CCI_Mopane.tif" #change for each ecozone
+file.exists("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/CCI_Mopane.tif")
+```
+
+Select the SW plot coordinate of every cluster
+```{r}
+Pol_subset_Mo<-NFI_data[(NFI_data$FOREST_STR_NEW=="Mopane"&NFI_data$Plot==1),c("x","y")]
+nrow(Pol_subset_Mo)
+```
+Create polygons and calculate mean CCI Biomass values
+```{r}
+polygon_Tmp=c() 
+Mapbiomass_MO=rep(NA,nrow(Pol_subset_Mo)) 
+for(i in 1:nrow(Pol_subset_Mo)){
+  polygon_Tmp=MakePolygon(df_coord=Pol_subset_Mo[i,],198,40)
+  Mapbiomass_MO[i]=Mapvalues(pol=polygon_Tmp,wghts = T) [1]
+}
+```
+
+Store those PSUs map value means as stored as `Mapbiomass_MO`
+```{r}
+str(Mapbiomass_MO)
+```
+
+# 3. Bind all data frames
+
+Append the PSUs map value means to the srata-wise dataframe, and combine 
+
+```{r}
+SEF_Biom<-data.frame((NFI_data[(NFI_data$FOREST_STR_NEW=="Semi evergreen forest"&NFI_data$Plot==1),]),MapBiom_Pol=Mapbiomass_SEF)
+SDF_Biom<-data.frame((NFI_data[(NFI_data$FOREST_STR_NEW=="Semi deciduous forest"&NFI_data$Plot==1),]),MapBiom_Pol=Mapbiomass_SDF)
+Mopane_Biom<-data.frame((NFI_data[(NFI_data$FOREST_STR_NEW=="Mopane"&NFI_data$Plot==1),]),MapBiom_Pol=Mapbiomass_MO)
+Mecrusse_Biom<-data.frame((NFI_data[(NFI_data$FOREST_STR_NEW=="Mecrusse"&NFI_data$Plot==1),]),MapBiom_Pol=Mapbiomass_ME)
+All_Biom<-rbind(SEF_Biom,SDF_Biom,Mopane_Biom,Mecrusse_Biom)
+head(All_Biom)
+All_Biom<-All_Biom[order(All_Biom$id_plot_new),]
+table(is.na(All_Biom$MapBiom_Pol))
+```
+
+In this final step, we compute the mean AGB over the plots within the i-th PSU (NFI_cluster_mean) and the mean AGB over the CCI biomass map units within the i-th PSU (MapBiom_Pol). Then, we join all the information in one dataframe `All_Biom_j`
+
+```{r}
+names(NFI_data)<-c(names(NFI_data)[1:9],"subplot_AGB",names(NFI_data)[11:17])
+head(NFI_data)
+AGB_cluster<-NFI_data%>%group_by(Cluster_new)%>%summarise(NFI_cluster_mean=mean(subplot_AGB))
+AGB_cluster<-as.data.frame(AGB_cluster)
+All_Biom_j <- All_Biom %>% inner_join( AGB_cluster, 
+                              by=c('Cluster_new'))
+
+```
+
+Don't forget to save ;)
+
+```{r}
+openxlsx::write.xlsx(All_Biom_j,"Map_cluster_biomassv4_2.xlsx") 
+```
diff --git a/WUR_MOZ_2024/4. Model-assisted estimation.Rmd b/WUR_MOZ_2024/4. Model-assisted estimation.Rmd
new file mode 100644
index 0000000..509405b
--- /dev/null
+++ b/WUR_MOZ_2024/4. Model-assisted estimation.Rmd	
@@ -0,0 +1,292 @@
+---
+title: "4. Model assisted estimation"
+output: html_notebook
+---
+
+This code and text has been adapted from Malaga et al. (under review), "Global biomass maps can increase the precision of 1 (sub)national aboveground biomass estimates: a 2 comparison across tropical countries". In the previous three notebooks we have shown how to pre-process Mozambique's NFI dataset and the CCI Biomass map (yr 2017, v.4), as well as computing the mean  AGB mean values over the remote sensing sampling unit (PSU). 
+
+In this notebook, we will perform a model-assisted estimation of AGB (also referred to a scenario B). In order to assess the gain in precision, we will also estimate AGB using exclusively the NFI data (field-based scenario; also referred to as scenario A). The gain in precision from a model-assisted scenario can only be assessed if compared to a baseline scenario, where the CCI biomass map is not used as auxiliary information.
+
+```{r}
+library(openxlsx)
+library(lmtest)
+library(ggplot2)
+library(terra)
+library(dplyr)
+```
+###################################################################################################
+# **PART 1:  Model-assisted estimation** 
+###################################################################################################
+
+# 1. Load the data set
+At this point, the data should already be clean and pre-processed, including mean AGB values for PSUs
+```{r}
+ALL_data=openxlsx::read.xlsx("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/Map_cluster_biomassv4_2.xlsx")  
+head(ALL_data)
+table(ALL_data$MapBiom_Pol<=0) #There are 15 cluster with Map AGB means =0
+```
+
+Subset the data per forest strata
+
+```{r}
+Data_Me<-ALL_data[ALL_data$FOREST_STR_NEW=="Mecrusse",]
+Data_Mo<-ALL_data[ALL_data$FOREST_STR_NEW=="Mopane",]
+Data_sdf<-ALL_data[ALL_data$FOREST_STR_NEW=="Semi deciduous forest",]
+Data_sef<-ALL_data[ALL_data$FOREST_STR_NEW=="Semi evergreen forest",]
+
+```
+
+# 2. Exploring the Map-to-plot regressions 
+
+## Mercrusse
+```{r}
+Me_reg<-lm(NFI_cluster_mean~MapBiom_Pol,data=Data_Me)
+plot(residuals(Me_reg)~fitted(Me_reg),main="residuals vs. fitted",xlab="residuals",ylab = "fitted values")
+```
+## Mopane
+```{r}
+Mo_reg<-lm(NFI_cluster_mean~MapBiom_Pol,data=Data_Mo)
+plot(residuals(Mo_reg)~fitted(Mo_reg),main="residuals vs. fitted",xlab="residuals",ylab = "fitted values")
+```
+## Semi-decidious forest
+```{r}
+sdf_reg<-lm(NFI_cluster_mean~MapBiom_Pol,data=Data_sdf)
+plot(residuals(sdf_reg)~fitted(sdf_reg),main="residuals vs. fitted",xlab="residuals",ylab = "fitted values")
+```
+## Semi-evergreen forest
+```{r}
+sef_reg<-lm(NFI_cluster_mean~MapBiom_Pol,data=Data_sef)
+plot(residuals(sef_reg)~fitted(sef_reg),main="residuals vs. fitted",xlab="residuals",ylab = "fitted values")
+```
+
+Note that for some strata, the residuals of the selected linear models exhibited non-negligible heteroscedasticity according to the Breusch-Pagan test (Zeileis and Hothorn, 2002), for which we corrected following the procedure described in McRoberts et al. (2016). For the sake of keepint this excercise simple, we are not following this procedure here.
+
+# 3. Visualise the models
+
+One visualization example of the regression model
+## Mercrusse forests
+```{r}
+summary(Me_reg)
+ggplot(Data_Me, aes(x = MapBiom_Pol, y = NFI_cluster_mean)) + 
+  geom_point() +
+  geom_smooth(col = "red", method="lm", se = FALSE) +
+  geom_point(shape = 5)+
+  geom_abline(intercept = 0, slope = 1, linetype="dashed")+
+  theme_classic()+
+  theme(axis.text.x=element_text(), axis.text.y=element_text(), 
+        axis.title=element_text(color="black", face="bold"), plot.title = element_text(color="black", face="bold", hjust = 0.5))+
+  ggtitle("MOZAMBIQUE: Mecrusse")+
+  labs(y="AGB plot data [Mg ha-1]")+
+  labs(x="AGB map data [Mg ha-1]")+
+  geom_text(x = 60, y = 180, label = "\U0177 = 78.1+0.5x    R\u00b2 0.01", color="red", parse = F)+
+  scale_x_continuous(limits=c(0,100), expand = c(0, 0))+
+  scale_y_continuous(limits = c(0,220), expand = c(0, 0))+
+  theme(plot.margin = unit(c(0.1, 1, 0.1, 0.2), "cm")) 
+```
+
+# 4. Predicting locally calibrated AGB map mean values (y_caret_ps)
+
+```{r}
+y_hut_ps_Me = predict(lm(NFI_cluster_mean~MapBiom_Pol,data=Data_Me))
+y_hut_ps_Mo = predict(lm(NFI_cluster_mean~MapBiom_Pol,data=Data_Mo))
+y_hut_ps_sdf = predict(lm(NFI_cluster_mean~MapBiom_Pol,data=Data_sdf))
+y_hut_ps_sef = predict(lm(NFI_cluster_mean~MapBiom_Pol,data=Data_sef))
+```
+
+# 5. Estimating the model-assisted Mean and Variance
+
+From MÃ¡alaga et al. (in review): For the model-assisted scenario, we implemented case A two-stage estimators (SÃ¤rndal et al., 1992). The estimator of the population mean consists of the sum of a prediction-based term and a residual-based adjustment term. For wall-to-wall auxiliary data, the prediction-based term is the synthetic estimator calculated as the mean of calibrated map predictions over all PSUs within the population (g) in stratum h (SÃ¤rndal et al., 1992, p. 399), which in this case corresponds to AGB means over the 2x2 map units from our country-calibrated maps. The within-stratum adjustment term (e_mean) is computed as the difference between the AGB mean observations over the plots within the selected ith PSU (NFI_cluster_mean), and their corresponding mean model prediction for the ith PSU (y_hut_ps). 
+
+## i) calculating the residuals  
+
+Calculating the residuals for every PSU
+```{r}
+Me_ei= Data_Me$NFI_cluster_mean-y_hut_ps_Me
+Mo_ei= Data_Mo$NFI_cluster_mean-y_hut_ps_Mo
+sdf_ei= Data_sdf$NFI_cluster_mean-y_hut_ps_sdf
+sef_ei= Data_sef$NFI_cluster_mean-y_hut_ps_sef
+```
+
+
+Calculate the stratum-wise mean residuals
+```{r}
+e_mean_Me<- mean(Me_ei)
+e_mean_Mo<- mean(Mo_ei)
+e_mean_sdf<- mean(sdf_ei)
+e_mean_sef<- mean(sef_ei)
+```
+
+## ii) estimating the synthetic estimator 
+The synthetic estimator is calculated as the mean of calibrated map predictions over all PSUs within the population in each stratum. 
+
+Load the population - 2x2 aggregated raster of the CCI biomass maps, croped to the population of interest (4 strata)
+```{r}
+setwd("C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks") #set the directory for the CCI Biomass values
+CCI_Mo_ag<-terra::rast('CCI_Mopane_agg.tif')
+CCI_Me_ag<-terra::rast('CCI_Mecrusse_agg.tif')
+CCI_SDF_ag<-terra::rast('CCI_SDF_agg.tif')
+CCI_SEF_ag<-terra::rast('CCI_SEF_agg.tif')
+
+```
+Turn rasters into a matrix, to facilitate prediction
+
+```{r}
+CCI_Mo_m<-as.data.frame(terra::as.matrix(CCI_Mo_ag)) 
+colnames(CCI_Mo_m)[1]<-'MapBiom_Pol'
+CCI_Me_m<-as.data.frame(terra::as.matrix(CCI_Me_ag))
+colnames(CCI_Me_m)[1]<-'MapBiom_Pol'
+CCI_sdf_m<-as.data.frame(terra::as.matrix(CCI_SDF_ag)) 
+colnames(CCI_sdf_m)[1]<-'MapBiom_Pol'
+CCI_sef_m<-as.data.frame(terra::as.matrix(CCI_SEF_ag)) 
+colnames(CCI_sef_m)[1]<-'MapBiom_Pol'
+```
+
+calculate the population mean (y_caret pp)
+
+* Mercrusse
+```{r}
+global(CCI_Me_ag,mean,na.rm=TRUE) #This is the mean of the un-calibrated version of the map
+Y_syn_Me = predict(Me_reg, newdata=CCI_Me_m)
+mean(Y_syn_Me,na.rm=TRUE) #This is the mean of the calibrated version of the map
+```
+* Mopane
+```{r}
+global(CCI_Mo_ag,mean,na.rm=TRUE)  ##This is the mean of the un-calibrated version of the map
+Y_syn_Mo = predict(Mo_reg, newdata=CCI_Mo_m)
+mean(Y_syn_Mo,na.rm=TRUE) ##This is the mean of the calibrated version of the map
+```
+
+* Semi-deciduous forests
+```{r}
+global(CCI_SDF_ag,mean,na.rm=TRUE) # #This is the mean of the un-calibrated version of the map
+Y_syn_sdf = predict(sdf_reg, newdata=CCI_sdf_m)
+mean(Y_syn_sdf,na.rm=TRUE)  ##This is the mean of the calibrated version of the map
+```
+
+* Semi-evergreen forests
+```{r}
+global(CCI_SEF_ag,mean,na.rm=TRUE)  # This is the mean of the un-calibrated version of the map
+Y_syn_sef = predict(sef_reg, newdata=CCI_sdf_m)
+mean(Y_syn_sef,na.rm=TRUE)  #This is the mean of the calibrated version of the map
+```
+
+## iii) Estimating the locally calibrated mean and variance
+
+Using the estimator of the population mean (McRoberts et al., in review) 
+
+```{r}
+u_reg_Me <- mean(Y_syn_Me,na.rm=TRUE)+e_mean_Me
+u_reg_Mo <- mean(Y_syn_Mo,na.rm=TRUE)+e_mean_Mo
+u_reg_sdf <- mean(Y_syn_sdf,na.rm=TRUE)+e_mean_sdf
+u_reg_sef <- mean(Y_syn_sef,na.rm=TRUE)+e_mean_sef
+```
+
+Calculate variance
+
+The model-assisted variance estimator is the two-stage variance estimator, assuming the second-stage component of the variance to be negligible, equivalent to (MÃ¡laga et al., 2022) with observations replaced by model prediction residuals.
+
+```{r}
+Var_u_reg_Me=1/(length(Me_ei)*(length(Me_ei)-1))*sum((as.vector(Me_ei)-rep(e_mean_Me,length(Me_ei)))^2)
+Var_u_reg_Me
+```
+
+```{r}
+Var_u_reg_Mo=1/(length(Mo_ei)*(length(Mo_ei)-1))*sum((as.vector(Mo_ei)-rep(e_mean_Mo,length(Mo_ei)))^2)
+Var_u_reg_Mo
+
+```
+
+```{r}
+Var_u_reg_sdf=1/(length(sdf_ei)*(length(sdf_ei)-1))*sum((as.vector(sdf_ei)-rep(e_mean_sdf,length(sdf_ei)))^2)
+Var_u_reg_sdf
+
+```
+
+```{r}
+Var_u_reg_sef=1/(length(sef_ei)*(length(sef_ei)-1))*sum((as.vector(sef_ei)-rep(e_mean_sef,length(sef_ei)))^2)
+Var_u_reg_sef
+
+```
+###########################################################################################
+# **Part 2: Scenario A - Simple Expansion Estimator **
+###########################################################################################
+
+For the field-based scenario (baseline), we implemented a simple expansion estimator. The gain in precision from a model-assisted scenario can only be assessed if compared to a baseline scenario, where the CCI biomass map is not used as auxiliary information.
+
+Mean
+```{r}
+u_Me<-mean(Data_Me$NFI_cluster_mean)
+u_Me
+u_Mo<-mean(Data_Mo$NFI_cluster_mean)
+u_Mo
+u_sdf<-mean(Data_sdf$NFI_cluster_mean)
+u_sdf
+u_sef<-mean(Data_sef$NFI_cluster_mean)
+u_sef
+```
+
+Variance
+```{r}
+var_Me<-var(Data_Me$NFI_cluster_mean)/nrow(Data_Me)
+var_Me
+var_Mo<-var(Data_Mo$NFI_cluster_mean)/nrow(Data_Mo)
+var_Mo
+var_sdf<-var(Data_sdf$NFI_cluster_mean)/nrow(Data_sdf)
+var_sdf
+var_sef<-var(Data_sef$NFI_cluster_mean)/nrow(Data_sef)
+var_sef
+```
+###########################################################################################
+# **PART 3: Country total of the different scenarios** 
+###########################################################################################
+
+# Estimate totals for all forest strata
+
+Count number of cells of per strata and sum into `N_total`
+```{r}
+Nh_Me<-global(CCI_Me_ag,sum,na.rm=TRUE)
+Nh_Mo<-global(CCI_Mo_ag,sum,na.rm=TRUE)
+Nh_sdf<-global(CCI_SDF_ag,sum,na.rm=TRUE)
+Nh_sef<-global(CCI_SEF_ag,sum,na.rm=TRUE)
+N_total<-Nh_Me+Nh_Mo+Nh_sdf+Nh_sef
+```
+
+# Country estimate of the baseline scenario (scenario A)
+
+```{r}
+Mean_Mozam_scA<-Nh_Me/N_total*u_Me+Nh_Mo/N_total*u_Mo+Nh_sdf/N_total*u_sdf+Nh_sef/N_total*u_sef
+Mean_Mozam_scA
+Var_Mozam_scA<- ((Nh_Me/N_total)^2)*var_Me+((Nh_Mo/N_total)^2)*var_Mo+((Nh_sdf/N_total)^2)*var_sdf+((Nh_sef/N_total)^2)*var_sef
+Var_Mozam_scA
+```
+
+# Country estimate of the model-assisted scenario (scenario B)
+
+```{r}
+Mean_Mozam_scB<-Nh_Me/N_total*u_reg_Me+Nh_Mo/N_total*u_reg_Mo+Nh_sdf/N_total*u_reg_sdf+Nh_sef/N_total*u_reg_sef
+Mean_Mozam_scB
+Var_Mozam_scB<- ((Nh_Me/N_total)^2)*Var_u_reg_Me+((Nh_Mo/N_total)^2)*Var_u_reg_Mo+((Nh_sdf/N_total)^2)*Var_u_reg_sdf+((Nh_sef/N_total)^2)*Var_u_reg_sef
+Var_Mozam_scB
+```
+
+
+###########################################################################################
+# **PART 4: Assessing the gain in precision ** 
+###########################################################################################
+
+
+The degree of improvement is expressed by means of relative efficiency (RE, Eq 15 in Malaga et al (under review), where values greater than 1.0 indicate a gain in precision (smaller variance) 
+
+```{r}
+var_Me/Var_u_reg_Me
+var_Mo/Var_u_reg_Mo
+var_sdf/Var_u_reg_sdf
+var_sef/Var_u_reg_sef
+Var_Mozam_scA/Var_Mozam_scB
+```
+Output values indicate a gain in precision (smaller varience)  under the model-assisted scenario (scenario B). 
+
+References: 
+- Malaga et al. (under review), Global biomass maps can increase the precision of (sub)national aboveground biomass estimates: a comparison across tropical countries.
+- Malaga et al. (2022), Precision of subnational forest AGB estimates within the Peruvian Amazonia using a global biomass map. International Journal of Applied Earth Observation and Geoinformation, 115, 103102. https://doi.org/10.1016/j.jag.2022.103102
+- McRoberts et al. (2016). Hybrid estimators for mean aboveground carbon per unit area. Forest Ecology and Management, 378, 44â€“56.
diff --git a/WUR_MOZ_2024/Mozambique_MA.ipynb b/WUR_MOZ_2024/Mozambique_MA.ipynb
new file mode 100644
index 0000000..4d6b378
--- /dev/null
+++ b/WUR_MOZ_2024/Mozambique_MA.ipynb
@@ -0,0 +1,361 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "3db00a28-01c6-48be-8f0a-dd5ec87dde7c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "terra 1.7.55\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "###########################\n",
+    "# title: \"Downloading and pre-processing Mozambique's NFI data\"\n",
+    "# output: html_notebook\n",
+    "###########################\n",
+    "\n",
+    "# This code and text has been adapted from Malaga et al. (under review), \"Global biomass maps can increase the precision of 1 (sub)national aboveground biomass estimates: a 2 comparison across tropical countries\". This study has been carried out in collaboration with the MRV Unit, National Sustainable Development Fund (FNDS) in Mozambique, among many other collaborators. In order to use Mozambique's NFI data, careful attention must be taken to the NFI's sampling design, forest strata, among many other factors. The objective of this notebook is to show the steps followed to prepare Mozambique's NFI dataset for its future integration with CCI Biomass data. \n",
+    "\n",
+    "# The Mozambique NFI followed a restricted stratified random sampling approach, with the stratification based on an [agroecological map](https://www.fnds.gov.mz/mrv/index.php/documentos/relatorios/26-inventario-florestal-nacional/fileMaputo). The forest strata are:\n",
+    "\n",
+    "# * Mopane forests\n",
+    "# * Mercrusse forests\n",
+    "# * Semi-decidouous forests\n",
+    "# * Semi-evergreen forests\n",
+    "\n",
+    "# Plot locations were initally randomly selected but later decided to move to random locations to grid intersections. Upon visual inspection, some clusters of the original design were further re-located, because of discrepancies between the map forest strata and the ground conditions. Each cluster connsists of four 50m-by-20m (0,1 ha) plots at 50 m spacing configured in a square, each divided in four subplots of 25m-by-10m. All trees with DBH greater than 10 cm were measured within a plot and trees with DBH greater than 5 cm and lower than 10 cm were measured within the first subplot.\n",
+    "\n",
+    "# setwd(\"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks\")\n",
+    "# knitr::include_graphics(\"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/Mozambique NFI.png\")\n",
+    "\n",
+    "# The pre-processing steps followed here entail filtering out clusters with missing or duplicated coordinates. In addition, clusters that did not fall within the four forest strata were disregarded. Clusters crossing stratum boundaries were assigned to the majority clusters. \n",
+    "\n",
+    "# Loading packages\n",
+    "library(openxlsx)\n",
+    "library(sf)\n",
+    "library(dplyr)\n",
+    "library(geosphere)\n",
+    "library(lmtest)\n",
+    "library(ggplot2)\n",
+    "library(terra)\n",
+    "library(sp)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d3eee580-c9e2-42ee-a7de-5b550be11621",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Loading data\n",
+    "\n",
+    "# Load the NFI database\n",
+    "\n",
+    "NFI_data=openxlsx::read.xlsx(\"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Data_IFN_plt.xlsx\")\n",
+    "\n",
+    "# Load the Forest stratification layers\n",
+    "\n",
+    "Moz_strata_geo<-st_read(\"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Strata_reclassify_geogra.shp\")  #WGS_84"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6786a813-8715-4bf1-b9e4-b27afdd00dd2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1. Filtering out NA coordinates within the NFI plots\n",
+    "\n",
+    "# The original dataset includes 3272 rows with information (3272 plots)\n",
+    "# There are 3272 rows with coordinates and 148 with no coordinates\n",
+    "\n",
+    "table(is.na(NFI_data$Ycoordinate))\n",
+    "\n",
+    "# Remove the rows with NA coordinates and save new DataFrame as `NFI_data_c`\n",
+    "\n",
+    "NFI_data_c<-NFI_data[!is.na(NFI_data$Xcoordinate),] #remove NA coordinates\n",
+    "NFI_data_c<-NFI_data[!is.na(NFI_data$Ycoordinate),] #remove NA coordinates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83b7bd50-026d-4d11-9eb7-642953c17b78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 2. Filtering out duplicated clusters  \n",
+    "# To identify plot with duplicated coordinates, we will transform `NFI_data_c` to a Large SpatialPointsDataFrame\n",
+    "\n",
+    "coordinates(NFI_data_c) =~ Xcoordinate + Ycoordinate \n",
+    "\n",
+    "\n",
+    "Then, we will define the projection: WGS84\n",
+    "\n",
+    "crs_wgs84 <- CRS(SRS_string = \"EPSG:4326\") # WGS 84 has EPSG code 4326\n",
+    "slot(NFI_data_c, \"proj4string\") <- crs_wgs84\n",
+    "\n",
+    "# Create `TestFram` to identify duplicated coordinated, which are a total of 8 plots with duplicated coordinates\n",
+    "\n",
+    "TestFram=paste((NFI_data_c@coords[,1]),as.numeric(NFI_data_c@coords[,2]))\n",
+    "TestFrameCounts<-as.data.frame(table(TestFram))\n",
+    "TestFrameCounts[TestFrameCounts$Freq==max(TestFrameCounts$Freq),] #identify plots with duplicated \n",
+    "\n",
+    "# The coordinates in `TestFram` will be used to identify the plots/clusters that need to be removed\n",
+    "\n",
+    "NFI_data_c$id_parcela[NFI_data_c@coords[,1]==\"33.4935005878\"]\n",
+    "NFI_data_c$id_parcela[NFI_data_c@coords[,1]==\"33.4935032236\"]\n",
+    "NFI_data_c$id_parcela[NFI_data_c@coords[,1]==\"33.4944496012999\"]\n",
+    "NFI_data_c$id_parcela[NFI_data_c@coords[,1]==\"33.4944522421\"]\n",
+    "NFI_data_c$id_parcela[NFI_data_c@coords[,1]==\"33.5660090220999\"]\n",
+    "NFI_data_c$id_parcela[NFI_data_c@coords[,1]==\"33.566011865\"]\n",
+    "NFI_data_c$id_parcela[NFI_data_c@coords[,1]==\"33.5669523366999\"]\n",
+    "NFI_data_c$id_parcela[NFI_data_c@coords[,1]==\"33.5669551842999\"]\n",
+    "\n",
+    "# The 8 duplicated plots belong to the following clusters: \n",
+    "# * 216 & 218\n",
+    "# * 265 & 269\n",
+    "# We will remove clusters 218 and 269 and save the new dataframe as `NFI_Data_c2`\n",
+    "\n",
+    "NFI_data_c2<-NFI_data_c[!(NFI_data_c$Cluster%in%c(218,269)),]\n",
+    "nrow(NFI_data_c)-nrow(NFI_data_c2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c21bf6f1-6f49-49ea-ba62-6cff1aeab3b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 3. Incorporating the Stratification information to the NFI database\n",
+    "\n",
+    "# The population of interest was determined by the original stratification layer used in the sampling design: the 2008 re-classified agro-ecological map (MITADER, 2018). The forest strata (classes) were ruled by the stratification layer. The forest strata are: Mopane (Mo), Mecrusse (Me), Semi-deciduous forest (SDF) and Semi-evergreen forests (SEF). \n",
+    "# First, convert to sf data frame for st_join to work\n",
+    "\n",
+    "NFI_data_strata = st_as_sf(x=NFI_data_c2, \n",
+    "                    coords = c(\"Xcoordinate\", \"Ycoordinate\"),\n",
+    "                    crs = crs(Moz_strata_geo)) #Geographic   \n",
+    " \n",
+    "                    \n",
+    "# Check that projections are the same\n",
+    "\n",
+    "st_crs(Moz_strata_geo)==st_crs(NFI_data_strata)\n",
+    "\n",
+    "# Examine the stratification layer\n",
+    "\n",
+    "summary(Moz_strata_geo)\n",
+    "\n",
+    "# We're interested in the Forest Strata, therefore we will extract `FOREST_STR` from `Moz_strata_geo` and save as `NFI_data_strata`\n",
+    "\n",
+    "Moz_strata_layer<-Moz_strata_geo[,\"FOREST_STR\"]\n",
+    "sf::sf_use_s2(FALSE) # needed when using geographic coord but not UTM\n",
+    "NFI_data_strata=st_join(NFI_data_strata, Moz_strata_layer) # join/extract values of shapefile to spatial points\n",
+    "\n",
+    "# Remove funny duplicated rows that appear from the previous procedure: \n",
+    "\n",
+    "NFI_data_strata[duplicated(NFI_data_strata),]\n",
+    "NFI_data_strata = NFI_data_strata[!duplicated(NFI_data_strata),]\n",
+    "nrow(NFI_data_strata)\n",
+    "as.data.frame(NFI_data_strata)\n",
+    "\n",
+    "# Convert it back to a dataframe that includes the stratification info per plot\n",
+    "\n",
+    "NFI_data_df=NFI_data_strata %>% st_drop_geometry()\n",
+    "NFI_data_df = as.data.frame(NFI_data_df)\n",
+    "NFI_data_df$x=st_coordinates(NFI_data_strata)[,1]#re-assign the coord X\n",
+    "NFI_data_df$y=st_coordinates(NFI_data_strata)[,2]#re-assign the coord Y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "217e2df7-f2f3-42c1-a4e1-9cf145ff208b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 4. Re-define strata so all clusters fall within the same strata.\n",
+    "\n",
+    "# \"Clusters crossing stratum boundaries were assigned to the majority stratum\" \n",
+    "# Ruling: if there are NA plots within a cluster, assign the class corresponding to the first plot within the cluster. If the first plot is class NA, than use the next plot with a valid class... If all plots within a cluster are NA, than maintain the NA for the whole cluster\n",
+    "# Create `FOREST_STR_NEW`\n",
+    "\n",
+    "NFI_data_df$FOREST_STR_NEW=NA\n",
+    "NFI_data_df[NFI_data_df$FOREST_STR == \"NA\",]$FOREST_STR=NA #The function below needs for all \"NA\"s to be turned into real NAs\n",
+    "\n",
+    "# Re-assign the forest strata to the original data set as explained above\n",
+    "\n",
+    "for(i in unique(NFI_data_df$Cluster_new)){\n",
+    "  \n",
+    "  selected = which(NFI_data_df$Cluster_new == i)\n",
+    "  \n",
+    "  NFI_data_df[selected,]$FOREST_STR_NEW = ifelse(is.null(names(which.max(table(c(NFI_data_df[selected,]$FOREST_STR))))),NA,NFI_data_df[selected,]$FOREST_STR[!is.na(NFI_data_df[selected,]$FOREST_STR)][1])\n",
+    "}\n",
+    "\n",
+    "# Additional step: remove clusters with Strata=NA\n",
+    "# \"Clusters that did not fall within the four forest strata as defined by the re-classified Agro-ecological map (our area of interest) were disregarded, assuming they had been moved and departed from the original design.\"\n",
+    "\n",
+    "NFI_data_df[(is.na(NFI_data_df$FOREST_STR_NEW)),c(4,6,12,15)] # daniela: change columns if these are not the ones of interest\n",
+    "\n",
+    "# Remove rows with NA for strata\n",
+    "\n",
+    "NFI_data_n=NFI_data_df[!is.na(NFI_data_df$FOREST_STR_NEW),]\n",
+    "length(NFI_data_df$FOREST_STR_NEW)-length(NFI_data_n$FOREST_STR_NEW)\n",
+    "\n",
+    "# Save the cleaned database `NFI_data_n`\n",
+    "\n",
+    "openxlsx::write.xlsx(NFI_data_n, (\"Data_INF_plt_clean_strata_noNA2.xlsx\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b1be31e-f763-4dcc-a166-cf1c97127574",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "###########################\n",
+    "# title: \"2. CCI biomass Pre-processing steps\"\n",
+    "# output: html_notebook\n",
+    "###########################\n",
+    "\n",
+    "# This code and text has been adapted from Malaga et al. (under review), \"Global biomass maps can increase the precision of 1 (sub)national aboveground biomass estimates: a 2 comparison across tropical countries\". In the previous notebook (\"1. Downloading and pre-processing Mozambique's NFI data\"), we have explained the pre-processing steps needed to adapt Mozambique's NFI for its integration with CCI Biomass data. \n",
+    "\n",
+    "# In this notebook, we will show how we adapted the European Space Angency Climate Change Initiative (CCI) version 4 [global biomass maps](https://catalogue.ceda.ac.uk/uuid/af60720c1e404a9e9d2c145d2b2ead4e) for its subsequent integration with Mozambique's NFI. To minimize discrepancies product of temporal mismatches, the CCI Biomass epoch closest to Mozambique's NFI implementation period was chosen, which was CCI Biomass year 2017 (v.4). \n",
+    "\n",
+    "#Loading packages\n",
+    "\n",
+    "library(sp)\n",
+    "library(sf)\n",
+    "library(terra)\n",
+    "library(raster)\n",
+    "\n",
+    "# 0. Download CCI Biomass (2017 v4) for Mozambique \n",
+    "# Tiles of CCI Biomass year 2017 v4 can be found in the following link: [https://data.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v4.0/geotiff/2017](https://data.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v4.0/geotiff/2017). Save tiles into your working directory. \n",
+    "\n",
+    "# Load tiles from your working directory\n",
+    "\n",
+    "setwd(\"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/CCI 2017 v4\") #set the your working directory\n",
+    "\n",
+    "tile1<-raster(\"S10E030_ESACCI-BIOMASS-L4-AGB-MERGED-100m-2017-fv4.0.tif\")\n",
+    "tile2<-raster(\"S10E040_ESACCI-BIOMASS-L4-AGB-MERGED-100m-2017-fv4.0.tif\")\n",
+    "tile3<-raster(\"S20E030_ESACCI-BIOMASS-L4-AGB-MERGED-100m-2017-fv4.0.tif\")\n",
+    "\n",
+    "# 1. Merge the original CCI map tiles into a Mosaic `CCI_2017` and save\n",
+    "\n",
+    "CCI_2017<-merge(tile1,tile2,tile3) #this process might take some time\n",
+    "plot(CCI_2017)\n",
+    "\n",
+    "\n",
+    "# 2. Crop the CCI mosaic to the extent of the stratification layer (our defined population)\n",
+    "\n",
+    "# Load the stratification layer and check its extent and coordinate reference system\n",
+    "\n",
+    "Moz_strata_geo<-st_read(\"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Strata_reclassify_geogra.shp\") #WGS_84\n",
+    "extent(Moz_strata_geo)\n",
+    "plot(Moz_strata_geo[\"FOREST_STR\"])\n",
+    "\n",
+    "# Crop `CCI_2017`, and then force again the extent in order for the raster to overlay \n",
+    "\n",
+    "CCI_crop<-crop(CCI_2017, extent(Moz_strata_geo)) #crop the biomass map to the  extent of the study area\n",
+    "extent(Moz_strata_geo)==extent(CCI_crop) #check that extents match\n",
+    "bb <- extent(Moz_strata_geo)\n",
+    "extent(CCI_crop) <- bb\n",
+    "extent(Moz_strata_geo)==extent(CCI_crop) #check again if extents match now\n",
+    "\n",
+    "# 3. Crop the CCI Biomass into Mozambique's 4 forest strata \n",
+    "\n",
+    "# As seen above, there are four types of Forest strata: Semi decidious forest, Semi evergreen forests, Mopane forests and Mercrusse forests. Let's mask `CCI crop` for each Forest strata\n",
+    "# * Mopane forests\n",
+    "\n",
+    "library(terra)\n",
+    "mask_Mopane<-Moz_strata_geo[Moz_strata_geo$FOREST_STR==\"Mopane\",]\n",
+    "CCI_Mopane <- terra::crop(CCI_crop, mask_Mopane, mask=TRUE) \n",
+    "CCI_Mopane <- as(CCI_Mopane, \"SpatRaster\") # read as SpatRaster\n",
+    "mask_Mopane <- vect(mask_Mopane) #read as terra-friendly format (SpatVector)\n",
+    "CCI_Mopane <- terra::mask(CCI_Mopane, mask_Mopane)\n",
+    "plot(CCI_Mopane)\n",
+    "\n",
+    "\n",
+    "# * Mercrusse forests\n",
+    "mask_Mecrusse<-Moz_strata_geo[Moz_strata_geo$FOREST_STR==\"Mecrusse\",]\n",
+    "CCI_Mecrusse <- terra::crop(CCI_crop, mask_Mecrusse, mask=TRUE) \n",
+    "CCI_Mecrusse <- as(CCI_Mecrusse, \"SpatRaster\") # read as SpatRaster\n",
+    "mask_Mecrusse <- vect(mask_Mecrusse) #read as terra-friendly format (SpatVector)\n",
+    "CCI_Mecrusse <- terra::mask(CCI_Mecrusse, mask_Mecrusse)\n",
+    "plot(CCI_Mecrusse)\n",
+    "\n",
+    "\n",
+    "# * Semi deciduous forests\n",
+    "mask_SDF<-Moz_strata_geo[Moz_strata_geo$FOREST_STR==\"Semi deciduous forest\",]\n",
+    "CCI_SDF <- terra::crop(CCI_crop, mask_SDF, mask=TRUE) \n",
+    "CCI_SDF <- as(CCI_SDF, \"SpatRaster\") # read as SpatRaster\n",
+    "mask_SDF <- vect(mask_SDF) #read as terra-friendly format (SpatVector)\n",
+    "CCI_SDF <- terra::mask(CCI_SDF, mask_SDF)\n",
+    "plot(CCI_SDF)\n",
+    "\n",
+    "\n",
+    "# * Semi evergreen forests\n",
+    "mask_SEF<-Moz_strata_geo[Moz_strata_geo$FOREST_STR==\"Semi evergreen forest\",]\n",
+    "CCI_SEF <- terra::crop(CCI_crop, mask_SEF, mask=TRUE) \n",
+    "CCI_SEF <- as(CCI_SEF, \"SpatRaster\") # read as SpatRaster\n",
+    "mask_SEF <- vect(mask_SEF) #read as terra-friendly format (SpatVector)\n",
+    "CCI_SEF <- terra::mask(CCI_SEF, mask_SEF)\n",
+    "plot(CCI_SEF)\n",
+    "\n",
+    "# Don't forget to save ;)\n",
+    "writeRaster(CCI_Mopane, \"CCI_Mopane.tif\", overwrite=TRUE)\n",
+    "writeRaster(CCI_Mecrusse, \"CCI_Mecrusse.tif\", overwrite=TRUE)\n",
+    "writeRaster(CCI_SDF, \"CCI_SDF.tif\", overwrite=TRUE)\n",
+    "writeRaster(CCI_SEF, \"CCI_SEF.tif\", overwrite=TRUE)\n",
+    "\n",
+    "\n",
+    "# 4.  Aggregate the per-stratum maps to define the population\n",
+    "# A PSU consists of a square polygon whose dimensions are large enough to encompass all plots within a cluster. See figure of Mozambique's plot design below. To obtain PSU-equivalent map AGB estimates for the population (synthetic estimator, later seen in the estimator), CCI map units were aggregated to polygons of the same size as defined by the PSU. This implies aggregating the map by a factor of 2. \n",
+    "setwd(\"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks\")\n",
+    "knitr::include_graphics(\"C:/Users/Daniela/Nextcloud/Documents/Collaborations/Natalia Malaga/Mozambique_code_example/Mozambique code/Notebooks/Mozambique NFI.png\")\n",
+    "\n",
+    "\n",
+    "# Aggregate the map y a factor of 2\n",
+    "CCI_Mo_ag<-aggregate(CCI_Mopane,fact=2,fun=mean,na.rm=TRUE) \n",
+    "CCI_Me_ag<-aggregate(CCI_Mecrusse,fact=2,fun=mean,na.rm=TRUE)\n",
+    "CCI_SDF_ag<-aggregate(CCI_SDF,fact=2,fun=mean,na.rm=TRUE)\n",
+    "CCI_SEF_ag<-aggregate(CCI_SEF,fact=2,fun=mean,na.rm=TRUE)\n",
+    "\n",
+    "# Don't forget to save aggregated rasters ;)\n",
+    "writeRaster(CCI_Mo_ag, \"CCI_Mopane_agg2.tif\", overwrite=TRUE)\n",
+    "writeRaster(CCI_Me_ag, \"CCI_Mecrusse_agg2.tif\", overwrite=TRUE)\n",
+    "writeRaster(CCI_SDF_ag,\"CCI_SDF_agg2.tif\", overwrite=TRUE)\n",
+    "writeRaster(CCI_SEF_ag,\"CCI_SEF_agg2.tif\", overwrite=TRUE)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "R",
+   "language": "R",
+   "name": "ir"
+  },
+  "language_info": {
+   "codemirror_mode": "r",
+   "file_extension": ".r",
+   "mimetype": "text/x-r-source",
+   "name": "R",
+   "pygments_lexer": "r",
+   "version": "4.2.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab