diff --git a/__pycache__/get_gedi_data_fsspec.cpython-312.pyc b/__pycache__/get_gedi_data_fsspec.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1dd145893831fe9024ae578c30736fba7982492c
Binary files /dev/null and b/__pycache__/get_gedi_data_fsspec.cpython-312.pyc differ
diff --git a/get_gedi_data_fsspec.py b/get_gedi_data_fsspec.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ee0fbb4b2a4d3687d717c6f1c278e7fce9eb96c
--- /dev/null
+++ b/get_gedi_data_fsspec.py
@@ -0,0 +1,112 @@
+
+import sys
+import h5py
+import boto3
+import botocore
+import fsspec
+import requests
+from maap.maap import MAAP
+maap = MAAP(maap_host="api.maap-project.org")
+import os
+
+
+def assume_role_credentials(ssm_parameter_name):
+    """Return temporary AWS credentials for the role named in an SSM parameter.
+
+    The decrypted value of ``ssm_parameter_name`` (read through an SSM client
+    for ``us-west-2``) is passed to STS ``assume_role`` as the role ARN.
+
+    Args:
+        ssm_parameter_name: Name of the SSM parameter that stores the role ARN.
+
+    Returns:
+        The ``Credentials`` entry of the ``assume_role`` response.
+    """
+    # Create a session using your current credentials
+    session = boto3.Session()
+
+    # Retrieve the SSM parameter
+    ssm = session.client('ssm', "us-west-2")
+    parameter = ssm.get_parameter(
+        Name=ssm_parameter_name,
+        WithDecryption=True
+    )
+    parameter_value = parameter['Parameter']['Value']
+
+    # Assume the DAAC access role
+    sts = session.client('sts')
+    assumed_role_object = sts.assume_role(
+        RoleArn=parameter_value,
+        RoleSessionName='TutorialSession'
+    )
+
+    # From the response that contains the assumed role, get the temporary
+    # credentials that can be used to make subsequent API calls
+    credentials = assumed_role_object['Credentials']
+
+    return credentials
+
+# We can pass assumed role credentials into fsspec
+def fsspec_access(credentials):
+    """Return an fsspec ``s3`` filesystem authenticated with ``credentials``.
+
+    Args:
+        credentials: Mapping with ``AccessKeyId``, ``SecretAccessKey`` and
+            ``SessionToken`` keys, as returned by ``assume_role_credentials``.
+    """
+    return fsspec.filesystem(
+        "s3",
+        key=credentials['AccessKeyId'],
+        secret=credentials['SecretAccessKey'],
+        token=credentials['SessionToken']
+    )
+
+def lpdaac_gedi_https_to_s3(url):
+    """Map an LP DAAC GEDI HTTPS URL to its ``lp-prod-protected`` S3 URL.
+
+    The URL is split on ``/`` and addressed by fixed position: component 6 is
+    used as the product directory and component 8 as the granule file name.
+    NOTE(review): these positions match URLs of the form
+    ``https://<host>//GEDI_L1_L2/GEDI/<product>/<date>/<file>.h5`` (note the
+    double slash after the host); confirm every caller passes that form.
+
+    Args:
+        url: HTTPS URL of a GEDI ``.h5`` granule.
+
+    Returns:
+        ``s3://lp-prod-protected/<product>/<file without .h5>/<file>``.
+
+    Raises:
+        IndexError: If ``url`` has fewer than nine ``/``-separated components.
+    """
+    dir_comps = url.split("/")
+    fname = dir_comps[8]
+    # str.strip('.h5') removes any of the characters '.', 'h', '5' from both
+    # ends (e.g. "..._V005.h5" -> "..._V00"); removesuffix drops only the
+    # extension.
+    granule = fname.removesuffix('.h5')
+    return f"s3://lp-prod-protected/{dir_comps[6]}/{granule}/{fname}"
+
+def get_gedi_data(url):
+    """Copy a GEDI granule from the LP DAAC S3 bucket to ``output/``.
+
+    Args:
+        url: HTTPS URL of the granule; its basename names the local copy.
+
+    Returns:
+        Relative path of the local HDF5 file, ``output/<basename>``.
+    """
+    s3_fsspec = fsspec_access(assume_role_credentials("/iam/maap-data-reader"))
+    basename = os.path.basename(url)
+    out_dir = "output"
+    # h5py does not create missing parent directories.
+    os.makedirs(out_dir, exist_ok=True)
+    outfp = f"{out_dir}/{basename}"
+    # Context managers close the S3 handle and both HDF5 files even when the
+    # copy fails part-way.
+    with s3_fsspec.open(lpdaac_gedi_https_to_s3(url)) as remote:
+        with h5py.File(remote, "r") as gedi_ds, h5py.File(outfp, 'w') as dst:
+            for obj in gedi_ds:
+                gedi_ds.copy(obj, dst)
+    # Return filepath!
+    return outfp
diff --git a/main.py b/main.py
index 7b7ae49533b26fa369e2cc7ebf51c82d1a75c3c0..b0f0536368336f30f7b695b8b9edc48902c7444d 100644
--- a/main.py
+++ b/main.py
@@ -20,7 +20,9 @@ from itertools import repeat
 from pgap import GapDS, wf_smooth
 # import custom functions, etc.
 # from download_gedi import download_gedi
-from get_gedi_data import get_gedi_data
+# from get_gedi_data import get_gedi_data
+from get_gedi_data_fsspec import get_gedi_data
+
 
 ## GET CWD of file to locate path
 CWD = os.path.dirname(os.path.abspath(__file__))
@@ -152,7 +154,7 @@ if __name__ == '__main__':
     if not isinstance(idx, list):
         idx = [idx]
 
-    # run function in parallel
+    # run function in parallel (not available in MAAP)
     # pool = mp.Pool(10)
     # results = pool.map(gedi_bioindex, idx)
     # results = pool.starmap(gedi_bioindex, zip(idx, repeat(l1b_ds), repeat(l2a_ds)))
diff --git a/notebooks/run-gedi-biomass.ipynb b/notebooks/run-gedi-biomass.ipynb
index 4c550892c81740819f017b4e8db1da3fcb34cdf2..4b6438d2cb0f076cc7a9ba63da9fb12e09bf5e5e 100644
--- a/notebooks/run-gedi-biomass.ipynb
+++ b/notebooks/run-gedi-biomass.ipynb
@@ -53,7 +53,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 8,
    "id": "b39e76ac-d765-4b8c-baf1-fe41f3296de9",
    "metadata": {},
    "outputs": [
@@ -63,17 +63,6 @@
      "text": [
       "on file num: 100\r"
      ]
-    },
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[3], line 35\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
counter\u001b[38;5;241m%\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;241m==\u001b[39m\u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 34\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mon file num: \u001b[39m\u001b[38;5;124m\"\u001b[39m, counter, end\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\r\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 35\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m800\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 36\u001b[0m counter\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] } ], "source": [ @@ -89,7 +78,7 @@ "# Get matching string pattern and run main.py\n", "jobs_list = []\n", "counter=1\n", - "for l1b_fp in l1b_fpaths[100:]:\n", + "for l1b_fp in l1b_fpaths[200:]:\n", " # Get string pattern\n", " str_pattern = re.findall(\"[0-9]{13}\", os.path.basename(l1b_fp))[0] \n", " try:\n", @@ -111,7 +100,8 @@ " \n", " if counter%100==0:\n", " print(\"on file num: \", counter, end='\\r')\n", - " time.sleep(800)\n", + " break\n", + " time.sleep(1200)\n", " counter+=1" ] }, @@ -133,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "7fe1c6f4-37c5-4fe7-b9ff-51e17b7a5cf0", "metadata": {}, "outputs": [ @@ -141,8 +131,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Accepted\n", - "d974c84b-2691-4569-b496-1cc06380ec84\n" + "Succeeded\n", + "62fd5122-a199-4ae7-bbd1-d86e11a04a7e\n" ] } ], @@ -230,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "id": "9ecb7ffc-319f-40c9-8f6e-d18b6ea1c890", "metadata": {}, "outputs": [], @@ -250,17 +240,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "id": "f3ae4e2c-fa90-41ae-8fa2-9e3fd1241157", "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
- "13" + "169" ] }, - "execution_count": 10, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -271,7 +261,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "id": "5bb204a4-e7d4-4a83-898f-81db03d080aa", "metadata": {}, "outputs": [ @@ -366,55 +356,36 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "id": "b364772a", "metadata": {}, - "outputs": [], - "source": [ - "# Change to appropriate filepaths for urls\n", - "l1b_urls_fpath = \"/projects/biomass-gedi-conus/data/GEDI-L1B-2022-URLS.txt\"\n", - "l2a_urls_fpath = \"/projects/biomass-gedi-conus/data/GEDI-L2A-2022-URLS.txt\"\n", - "\n", - "with open(l1b_urls_fpath) as f:\n", - " l1b_fpaths = f.read().splitlines()\n", - "with open(l2a_urls_fpath) as f:\n", - " l2a_fpaths = f.read().splitlines()\n", - "\n", - "# Get matching string pattern and run main.py\n", - "jobs_list = []\n", - "counter=1\n", - "for l1b_fp in l1b_fpaths[1:]:\n", - " # Get string pattern\n", - " str_pattern = re.findall(\"[0-9]{13}\", os.path.basename(l1b_fp))[0] \n", - " try:\n", - " l2a_fp = [s for s in l2a_fpaths if str_pattern in s][0]\n", - " except:\n", - " print(\"No matching L2 file!\", l1b_fp)\n", - " continue\n", - " break\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "4a6460a7-4d65-4311-901c-e5685daeb2f0", - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "https://e4ftl01.cr.usgs.gov//GEDI_L1_L2/GEDI/GEDI01_B.002/2022.08.31/GEDI01_B_2022243125140_O21055_03_T02667_02_005_02_V002.h5\n", - "https://e4ftl01.cr.usgs.gov//GEDI_L1_L2/GEDI/GEDI02_A.002/2022.08.31/GEDI02_A_2022243125140_O21055_03_T02667_02_003_02_V002.h5\n" + "<KeysViewHDF5 ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']>\n", + "[243.6951 244.02603 244.26328 244.186 243.79663 243.32755 243.1004\n", + " 243.24414 243.63077 243.96375]\n" ] } ], "source": [ - 
"print(l1b_fp)\n", - "print(l2a_fp)" + "import h5py\n", + "fp = \"../output/GEDI01_B_2022243142430_O21056_03_T06784_02_005_02_V002.h5\"\n", + "with h5py.File(fp, \"r\") as f:\n", + " print(f.keys())\n", + " print(f['BEAM0000'][\"rxwaveform\"][:10])" ] }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4a6460a7-4d65-4311-901c-e5685daeb2f0", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -426,9 +397,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "osgeo-env", "language": "python", - "name": "python3" + "name": "osgeo-env" }, "language_info": { "codemirror_mode": { @@ -440,7 +411,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.12.0" } }, "nbformat": 4,