Commit 5d334c86 authored by Ian

added post-processing scripts

BEGIN;
-- Add adjusted-value columns to the menlove_healey_biohex table
ALTER TABLE menlove_healey_biohex
    ADD COLUMN biwf_temp_conifer_na_adj NUMERIC,
    ADD COLUMN biwf_temp_broadleaf_na_adj NUMERIC,
    ADD COLUMN biwf_temp_conifer_all_adj NUMERIC,
    ADD COLUMN biwf_temp_broadleaf_all_adj NUMERIC;
-- Populate the new columns from the originals, applying per-class adjustment factors
UPDATE menlove_healey_biohex
SET biwf_temp_conifer_na_adj = biwf_temp_conifer_na * 2.56,
    biwf_temp_broadleaf_na_adj = biwf_temp_broadleaf_na * 0.58,
    biwf_temp_conifer_all_adj = biwf_temp_conifer_all * 1.47,
    biwf_temp_broadleaf_all_adj = biwf_temp_broadleaf_all * 1;
COMMIT;
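-- Optional sanity check (a sketch, not part of the original script):
-- spot-check a few rows to confirm the adjustment factors were applied
SELECT biwf_temp_conifer_na, biwf_temp_conifer_na_adj,
       biwf_temp_broadleaf_na, biwf_temp_broadleaf_na_adj
FROM menlove_healey_biohex
LIMIT 5;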
-- Set the number of parallel workers for this session (adjust as needed)
SET max_parallel_workers_per_gather = 16;
SET max_parallel_workers = 16;
-- Create a temporary table to store the results
BEGIN;
CREATE TEMPORARY TABLE temp_results AS
SELECT
    conus_cells.fid AS cell_id,
    AVG(fr.biwf) AS avg_biwf
FROM
    conus_cells
    JOIN fourth_run_results fr ON ST_Contains(conus_cells.geom, fr.geom)
GROUP BY
    conus_cells.fid;
COMMIT;
BEGIN;
-- Add a new column to the conus_cells table
ALTER TABLE conus_cells ADD COLUMN biwf_fourth_run NUMERIC;
-- Update the new column with the calculated average values
UPDATE conus_cells c
SET biwf_fourth_run = tr.avg_biwf
FROM temp_results tr
WHERE c.fid = tr.cell_id;
-- Drop the temporary table
DROP TABLE temp_results;
COMMIT;
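-- The ST_Contains joins in these scripts scan geometry columns and run far
-- faster with GiST indexes. A sketch (assumes the indexes do not already
-- exist; index names are illustrative, and the other joined tables would
-- benefit from the same treatment):
CREATE INDEX IF NOT EXISTS conus_cells_geom_gist
    ON conus_cells USING GIST (geom);
CREATE INDEX IF NOT EXISTS fourth_run_results_geom_gist
    ON fourth_run_results USING GIST (geom);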
-- Set the number of parallel workers for this session (adjust as needed)
SET max_parallel_workers_per_gather = 16;
SET max_parallel_workers = 16;
-- Create a temporary table to store the results
BEGIN;
CREATE TEMPORARY TABLE temp_results AS
SELECT
    mhb.ushexes_id AS hex_id,
    AVG(results.biwf) AS avg_biwf
FROM
    menlove_healey_biohex mhb
    JOIN conus_raster_params results
        ON ST_Contains(mhb.geom, results.geom)
GROUP BY
    mhb.ushexes_id;
COMMIT;
BEGIN;
-- Add a new column to the menlove_healey_biohex table
ALTER TABLE menlove_healey_biohex
ADD COLUMN biwf_conus_raster_params NUMERIC;
-- Update the new column with the calculated average values
UPDATE menlove_healey_biohex mhb
SET biwf_conus_raster_params = tr.avg_biwf
FROM temp_results tr
WHERE mhb.ushexes_id = tr.hex_id;
-- Drop the temporary table
DROP TABLE temp_results;
COMMIT;
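-- Optional check (a sketch): hexes with no intersecting points keep a NULL
-- average, so counting them gives a quick coverage summary
SELECT COUNT(*) AS hexes_without_points
FROM menlove_healey_biohex
WHERE biwf_conus_raster_params IS NULL;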
-- Calculate the count of fifth_run_results points within
-- each menlove_healey_biohex polygon and store the results
BEGIN;
CREATE TEMPORARY TABLE temp_results AS
SELECT
    mhb.ushexes_id AS hex_id,
    COUNT(fr.biwf) AS count_biwf_fifth_run,
    COUNT(fr.l4_agbd) AS count_l4_agbd
FROM
    menlove_healey_biohex mhb
    JOIN fifth_run_results fr ON ST_Contains(mhb.geom, fr.geom)
GROUP BY
    mhb.ushexes_id;
COMMIT;
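-- The source file is cut off at this point; the following is a sketch of the
-- presumable continuation, mirroring the update pattern of the blocks above
-- (column names assumed from the counts computed in the temp table):
BEGIN;
ALTER TABLE menlove_healey_biohex
    ADD COLUMN count_biwf_fifth_run NUMERIC,
    ADD COLUMN count_l4_agbd NUMERIC;
UPDATE menlove_healey_biohex mhb
SET count_biwf_fifth_run = tr.count_biwf_fifth_run,
    count_l4_agbd = tr.count_l4_agbd
FROM temp_results tr
WHERE mhb.ushexes_id = tr.hex_id;
DROP TABLE temp_results;
COMMIT;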
#!/bin/bash
# Check if correct number of arguments is provided
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <input_directory> <target_table>"
exit 1
fi
INPUT_DIR="$1"
TARGET_TABLE="$2"
BATCH_SIZE=10
# Set your PostgreSQL connection parameters
PG_HOST="localhost"
PG_PORT="5432"
PG_DB="nmbim_results"
PG_USER="ian"
PG_PASS="grant"
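# Note: hardcoding a password in a script is risky. A common alternative
# (an assumption about your setup, not part of the original script) is a
# ~/.pgpass file, or exporting the libpq environment variable instead:
#   export PGPASSWORD="$PG_PASS"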
# Check if input directory exists
if [ ! -d "$INPUT_DIR" ]; then
    echo "Error: Input directory does not exist."
    exit 1
fi
# Debugging: Check for .gpkg files
echo "Checking for .gpkg files in $INPUT_DIR"
find "$INPUT_DIR" -name "*.gpkg" | head -n 5
# Debugging: List all .gpkg files and count
echo "Listing found .gpkg files:"
find "$INPUT_DIR" -name "*.gpkg"
echo "Number of .gpkg files found: $(find "$INPUT_DIR" -name "*.gpkg" | wc -l)"
# Debugging: Check for hidden characters in filenames
echo "Listing files with special characters visible:"
find "$INPUT_DIR" -name "*.gpkg" -print0 | xargs -0 ls -b
# Function to merge and load a batch of GeoPackages
process_batch() {
    local batch_num="$1"
    shift
    local input_gpkgs=("$@")
    # /vsimem/ is private to each GDAL process, so a file written there by
    # ogrmerge.py would not be visible to the separate ogr2ogr process below;
    # merge to a temporary on-disk file instead
    local output_gpkg="/tmp/merged_batch_${batch_num}.gpkg"
    echo "Merging batch $batch_num"
    # Use ogrmerge.py to merge the GeoPackages into a single layer
    ogrmerge.py -o "$output_gpkg" -f GPKG "${input_gpkgs[@]}" -overwrite_ds -single
    if [ $? -ne 0 ]; then
        echo "Error merging batch $batch_num"
        return 1
    fi
    echo "Loading merged GeoPackage into PostGIS"
    ogr2ogr -f PostgreSQL PG:"host=$PG_HOST port=$PG_PORT dbname=$PG_DB user=$PG_USER password=$PG_PASS" \
        "$output_gpkg" \
        -nln "$TARGET_TABLE" \
        -append \
        -update \
        -lco COPY_WKB=YES \
        -skipfailures
    if [ $? -ne 0 ]; then
        echo "Error loading batch $batch_num into PostGIS"
        rm -f "$output_gpkg"
        return 1
    fi
    # Remove the merged batch file
    rm -f "$output_gpkg"
    echo "Batch $batch_num completed"
}
export -f process_batch
export PG_HOST PG_PORT PG_DB PG_USER PG_PASS TARGET_TABLE
# Find all GeoPackages (compatible with older Bash versions)
IFS=$'\n' read -r -d '' -a gpkg_files < <(find "$INPUT_DIR" -name "*.gpkg" && printf '\0')
total_files=${#gpkg_files[@]}
# Debugging: Check array content
echo "Number of files in gpkg_files array: ${#gpkg_files[@]}"
echo "First few files:"
printf '%s\n' "${gpkg_files[@]:0:5}"
# Debugging: Check if ogrmerge.py can see the files
if [ ${#gpkg_files[@]} -gt 0 ]; then
    echo "Testing ogrinfo with first file:"
    first_file="${gpkg_files[0]}"
    ogrinfo -so "$first_file"
else
    echo "No .gpkg files found in the array"
fi
# Debugging: Verify GDAL installation
echo "GDAL version:"
gdalinfo --version
# Determine number of CPU cores and set max parallel jobs
max_jobs=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
echo "Using up to $max_jobs parallel jobs"
# Create batches and process in parallel.
# Bash arrays are not inherited by GNU parallel's child shells, so the file
# list is piped in and grouped into batches of BATCH_SIZE with -N; {#} is the
# job (batch) number and {} expands to the batch's file arguments.
printf '%s\n' "${gpkg_files[@]}" | \
parallel --jobs "$max_jobs" -N "$BATCH_SIZE" --halt now,fail=1 --joblog parallel.log --eta \
    'process_batch {#} {}'
echo "All GeoPackages processed and loaded into PostGIS table: $TARGET_TABLE"
#!/bin/bash
# Check if correct number of arguments is provided
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <input_directory> <target_table>"
exit 1
fi
INPUT_DIR="$1"
TARGET_TABLE="$2"
BATCH_SIZE=10
# Set your PostgreSQL connection parameters
PG_HOST="localhost"
PG_PORT="5432"
PG_DB="nmbim_results"
PG_USER="ian"
PG_PASS="grant"
# Check if input directory exists
if [ ! -d "$INPUT_DIR" ]; then
    echo "Error: Input directory does not exist."
    exit 1
fi
# Debugging: Check for .gpkg files
echo "Checking for .gpkg files in $INPUT_DIR"
find "$INPUT_DIR" -name "*.gpkg" | head -n 5
# Debugging: List all .gpkg files and count
echo "Listing found .gpkg files:"
find "$INPUT_DIR" -name "*.gpkg"
echo "Number of .gpkg files found: $(find "$INPUT_DIR" -name "*.gpkg" | wc -l)"
# Debugging: Check for hidden characters in filenames
echo "Listing files with special characters visible:"
find "$INPUT_DIR" -name "*.gpkg" -print0 | xargs -0 ls -b
# Function to merge and load a batch of GeoPackages
process_batch() {
    local batch_num="$1"
    shift
    local input_gpkgs=("$@")
    local output_gpkg="merged_batch_${batch_num}.gpkg"
    echo "Merging batch $batch_num"
    echo "Input files for this batch:"
    printf '%s\n' "${input_gpkgs[@]}"
    # Check file permissions and existence
    for file in "${input_gpkgs[@]}"; do
        if [ ! -r "$file" ]; then
            echo "Error: Cannot read file $file"
        fi
        if [ ! -s "$file" ]; then
            echo "Error: File $file is empty"
        fi
    done
    # Use ogrinfo to check each file
    for file in "${input_gpkgs[@]}"; do
        echo "Checking $file with ogrinfo:"
        ogrinfo -so "$file"
    done
    # Use ogrmerge.py to merge the GeoPackages
    echo "Running ogrmerge.py command:"
    echo ogrmerge.py -o "$output_gpkg" -f GPKG "${input_gpkgs[@]}" -overwrite_ds -single
    ogrmerge.py -o "$output_gpkg" -f GPKG "${input_gpkgs[@]}" -overwrite_ds -single
    if [ $? -ne 0 ]; then
        echo "Error merging batch $batch_num"
        return 1
    fi
    ogrinfo "$output_gpkg"
    echo "Loading merged GeoPackage into PostGIS"
    ogr2ogr -f PostgreSQL PG:"host=$PG_HOST port=$PG_PORT dbname=$PG_DB user=$PG_USER password=$PG_PASS" \
        "$output_gpkg" \
        -nln "$TARGET_TABLE" \
        -append \
        -update \
        -lco COPY_WKB=YES \
        -skipfailures
    if [ $? -ne 0 ]; then
        echo "Error loading batch $batch_num into PostGIS"
        rm -f "$output_gpkg"
        return 1
    fi
    # Remove the merged batch file
    rm -f "$output_gpkg"
    echo "Batch $batch_num completed"
}
# Find all GeoPackages
mapfile -t -d '' gpkg_files < <(find "$INPUT_DIR" -name "*.gpkg" -print0)
total_files=${#gpkg_files[@]}
# Debugging: Check array content
echo "Number of files in gpkg_files array: ${#gpkg_files[@]}"
echo "First few files:"
printf '%s\n' "${gpkg_files[@]:0:5}"
# Debugging: Check if ogrinfo can see the files
if [ ${#gpkg_files[@]} -gt 0 ]; then
    echo "Testing ogrinfo with first file:"
    first_file="${gpkg_files[0]}"
    ogrinfo -so "$first_file"
else
    echo "No .gpkg files found in the array"
fi
# Debugging: Verify GDAL installation
echo "GDAL version:"
gdalinfo --version
# Process batches sequentially
for ((i=0; i<total_files; i+=BATCH_SIZE)); do
    batch_num=$((i / BATCH_SIZE + 1))
    # The array slice stops at the end of the list automatically
    batch_files=("${gpkg_files[@]:i:BATCH_SIZE}")
    if ! process_batch "$batch_num" "${batch_files[@]}"; then
        echo "Error processing batch $batch_num. Stopping execution."
        exit 1
    fi
done
echo "All GeoPackages processed and loaded into PostGIS table: $TARGET_TABLE"