{ "cells": [ { "cell_type": "markdown", "id": "1f5b6821-d96b-46ac-bd33-076112a08445", "metadata": {}, "source": [ "# Harvest\n", "\n", "Use meta data from catalog datasets to create a meta data base." ] }, { "cell_type": "code", "execution_count": null, "id": "b94696a9-7092-4f59-b85c-213f4c77cadb", "metadata": {}, "outputs": [], "source": [ "from copy import deepcopy\n", "import intake\n", "import yaml\n", "import pandas as pd\n", "import datetime\n", "import json\n", "import fsspec as fs\n", "from copy import deepcopy" ] }, { "cell_type": "code", "execution_count": null, "id": "954f1ffd-4c0d-4152-9081-5495df6535de", "metadata": { "tags": [] }, "outputs": [], "source": [ "def get_sources(catalog,name=None):\n", " newname='.'.join(\n", " [ a \n", " for a in [name, catalog.name]\n", " if a\n", " ]\n", " )\n", " data_sources = []\n", " \n", " for key, entry in catalog.items():\n", " if key==\"csv\" or key==\"esm-json\":\n", " continue\n", " elif isinstance(entry, intake.catalog.Catalog):\n", " if newname == \"main\":\n", " newname = None\n", " # If the entry is a subcatalog, recursively search it\n", " data_sources.extend(get_sources(entry, newname))\n", " elif isinstance(entry, intake.source.base.DataSource):\n", " data_sources.append(newname+\".\"+key)\n", "\n", " return data_sources" ] }, { "cell_type": "code", "execution_count": null, "id": "9bace732-62a4-4598-9387-c44ad7614c02", "metadata": {}, "outputs": [], "source": [ "cat=intake.open_catalog(\"../../main.yaml\")\n", "sources=get_sources(cat)" ] }, { "cell_type": "code", "execution_count": null, "id": "4df349e4-fb76-46ab-bf3c-e75bfaf0b12f", "metadata": {}, "outputs": [], "source": [ "list(set([a.split('.')[1] for a in sources]))" ] }, { "cell_type": "code", "execution_count": null, "id": "476860b4-99d4-4edc-b522-cc2ab45f7e70", "metadata": {}, "outputs": [], "source": [ "drs=\"levelType_dataType_frequency\"" ] }, { "cell_type": "code", "execution_count": null, "id": "7524c991-cbfe-49c0-b8e1-eec925322e1d", "metadata": { "tags": [] }, "outputs": [], "source": [ "catraw=yaml.safe_load(cat.text)" ] }, { "cell_type": "code", "execution_count": null, "id": "fc86fb56-5e7a-4586-81c0-7d68f3989349", "metadata": {}, "outputs": [], "source": [ "def map_drs(olds):\n", " s=olds.split('.')[1]\n", " #description=cat.describe()\n", " description=catraw[\"sources\"][s]\n", " metadata=description[\"metadata\"]\n", " metadata_keys=[\"format\",\"grid_id\",\"member_id\",\"institution_id\",\"institution\",\"references\",\"simulation_id\",\"variables\",\"variable-long_names\"]\n", " filtered={k:metadata[k] for k in metadata_keys if k in metadata.keys()}\n", " mapped=dict()\n", " mapped[\"format\"]=\"netcdf\"\n", " if filtered:\n", " mapped.update(filtered)\n", " if not \"variables\" in filtered:\n", "# if \"user_parameters\" in description:\n", " if \"parameters\" in description:\n", " mapped[\"variables\"]=description[\"parameters\"][\"variables\"][\"allowed\"]\n", "# for paramdict in description[\"user_parameters\"]:\n", "# if paramdict[\"name\"]==\"variables\":\n", "# print(s)\n", "# mapped[\"variables\"]=paramdict[\"allowed\"]\n", " for idx,k in enumerate(drs.split('_')):\n", " mapped[k]=s.split('_')[idx]\n", " if \"aggregation\" in mapped:\n", " if mapped[\"aggregation\"].endswith('0'):\n", " mapped[\"aggregation\"]+=s.split('.')[-1]\n", " urlpath=description[\"args\"][\"urlpath\"]\n", " if type(urlpath)==list:\n", " urlpath=urlpath[0]\n", " if urlpath.startswith(\"reference\"):\n", " mapped[\"format\"]=\"zarr\"\n", " elif urlpath.endswith(\".grib\") :\n", " mapped[\"format\"]=\"grib\"\n", " mapped[\"urlpath\"]=description[\"args\"][\"urlpath\"]\n", " return mapped" ] }, { "cell_type": "code", "execution_count": null, "id": "9a104da1-6e0e-49d9-b6f8-81cdc8d40479", "metadata": { "tags": [] }, "outputs": [], "source": [ "df=pd.DataFrame(list(map(map_drs,sources)))" ] }, { "cell_type": "code", "execution_count": null, "id": "e8d614d9-f20a-4797-8d83-e9aa5e26a146", "metadata": { "tags": [] }, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "id": "95a9b288-8d16-462d-9378-b22961b1f555", "metadata": { "tags": [] }, "outputs": [], "source": [ "#df[\"variables\"]=df[\"variables\"].astype(str)\n", "#df[\"variable-long_names\"]=df[\"variable-long_names\"].astype(str)\n", "#df[\"urlpath\"]=df[\"urlpath\"].astype(str)" ] }, { "cell_type": "code", "execution_count": null, "id": "422b61c4-83e0-4518-945e-a811d438bc30", "metadata": { "tags": [] }, "outputs": [], "source": [ "df.to_csv(\"dkrz_era5_disk.csv.gz\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "81fbc3d4-f048-49f6-ac52-9f584d314627", "metadata": {}, "outputs": [], "source": [ "template=json.load(fs.open(\"https://raw.githubusercontent.com/eerie-project/intake_catalogues/main/jasmin/jasmin-catalogue.json\").open())" ] }, { "cell_type": "code", "execution_count": null, "id": "8b26d221-e5a9-4752-a460-01326db6e7e3", "metadata": { "tags": [] }, "outputs": [], "source": [ "descriptive_json=deepcopy(template)" ] }, { "cell_type": "code", "execution_count": null, "id": "fc989b98-4b1c-411f-a3e3-db449113fca0", "metadata": { "tags": [] }, "outputs": [], "source": [ "descriptive_json[\"assets\"]['column_name']='urlpath'\n", "del descriptive_json[\"assets\"]['format']\n", "descriptive_json[\"assets\"]['format_column_name']='format'" ] }, { "cell_type": "code", "execution_count": null, "id": "018e8577-9696-4c65-81e8-e7575a1a6f83", "metadata": { "tags": [] }, "outputs": [], "source": [ "descriptive_json[\"attributes\"]=[\n", " {\n", " 'column_name': a, 'vocabulary': ''\n", " }\n", " for a in df.columns\n", "]" ] }, { "cell_type": "code", "execution_count": null, "id": "f727e403-9205-4201-a4db-a3beb4bf7f65", "metadata": { "tags": [] }, "outputs": [], "source": [ "descriptive_json[\"aggregation_control\"][\"variable_column_name\"]=\"variables\"\n", "#descriptive_json[\"aggregation_control\"][\"groupby_attrs\"]=drs.split('.')\n", "descriptive_json[\"aggregation_control\"][\"groupby_attrs\"]=drs.split('_')\n", "descriptive_json[\"aggregation_control\"][\"aggregations\"]=[\n", " {\n", " 'type': 'union',\n", " 'attribute_name': 'variables',\n", " 'options': {}\n", " }\n", "]" ] }, { "cell_type": "code", "execution_count": null, "id": "c2c6cc71-1ca6-4c8d-933f-e0c38e7ba677", "metadata": { "tags": [] }, "outputs": [], "source": [ "descriptive_json[\"id\"]=\"dkrz-catalogue\"\n", "descriptive_json[\"last_updated\"]=datetime.datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", "del descriptive_json[\"catalog_file\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "62952336-495c-4346-907e-f058613fc156", "metadata": { "tags": [] }, "outputs": [], "source": [ "#save this in github:\n", "with open(\"dkrz_era5_disk.json\",\"w\") as f:\n", " f.write(json.dumps(\n", " descriptive_json,\n", " sort_keys=True,\n", " indent=4,\n", " separators=(',', ': ')\n", "))" ] }, { "cell_type": "code", "execution_count": null, "id": "0f3e6d4d-4969-441f-b725-4a0e36dfcd8b", "metadata": { "tags": [] }, "outputs": [], "source": [ "esmcat=intake.open_esm_datastore(\n", " obj=dict(\n", " esmcat=descriptive_json,\n", " df=df\n", " ),\n", " columns_with_iterables=[\"variables\",\"variable-long_names\"]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "71f4b618-026b-4a81-a329-7d457de6482b", "metadata": { "tags": [] }, "outputs": [], "source": [ "esmcat" ] }, { "cell_type": "code", "execution_count": null, "id": "c843b4d7-0d37-464f-9acf-fbd32963e3b9", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "0e5b4df6-1c0c-47ab-8e8d-44a754941310", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "dkrzcatalog", "language": "python", "name": "dkrzcatalog" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }