From 19851a538774aa727bf1f26869f969a53801bfbd Mon Sep 17 00:00:00 2001 From: Integer-Ctrl Date: Tue, 25 Nov 2025 12:29:23 +0100 Subject: [PATCH 1/8] feat: databus api key for downloading --- databusclient/cli.py | 8 ++++--- databusclient/client.py | 50 ++++++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/databusclient/cli.py b/databusclient/cli.py index 4e97470..3209008 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -94,10 +94,11 @@ def deploy(version_id, title, abstract, description, license_url, apikey, @click.argument("databusuris", nargs=-1, required=True) @click.option("--localdir", help="Local databus folder (if not given, databus folder structure is created in current working directory)") @click.option("--databus", help="Databus URL (if not given, inferred from databusuri, e.g. https://databus.dbpedia.org/sparql)") -@click.option("--token", help="Path to Vault refresh token file") +@click.option("--vault-token", help="Path to Vault refresh token file") +@click.option("--databus-key", help="Databus API key to donwload from protected databus") @click.option("--authurl", default="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", show_default=True, help="Keycloak token endpoint URL") @click.option("--clientid", default="vault-token-exchange", show_default=True, help="Client ID for token exchange") -def download(databusuris: List[str], localdir, databus, token, authurl, clientid): +def download(databusuris: List[str], localdir, databus, vault_token, databus_key, authurl, clientid): """ Download datasets from databus, optionally using vault access if vault options are provided. 
""" @@ -105,7 +106,8 @@ def download(databusuris: List[str], localdir, databus, token, authurl, clientid localDir=localdir, endpoint=databus, databusURIs=databusuris, - token=token, + token=vault_token, + databus_key=databus_key, auth_url=authurl, client_id=clientid, ) diff --git a/databusclient/client.py b/databusclient/client.py index 358f1a6..1a16586 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -491,7 +491,7 @@ def deploy_from_metadata( print(f" - {entry['url']}") -def __download_file__(url, filename, vault_token_file=None, auth_url=None, client_id=None) -> None: +def __download_file__(url, filename, vault_token_file=None, databus_key=None, auth_url=None, client_id=None) -> None: """ Download a file from the internet with a progress bar using tqdm. @@ -523,7 +523,8 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien response = requests.get(url, stream=True, allow_redirects=False) # no redirects here, we want to see if auth is required www = response.headers.get('WWW-Authenticate', '') # get WWW-Authenticate header if present to check for Bearer auth - if (response.status_code == 401 or "bearer" in www.lower()): + # Vault token required if 401 Unauthorized with Bearer challenge + if (response.status_code == 401 and "bearer" in www.lower()): print(f"Authentication required for {url}") if not (vault_token_file): raise ValueError("Vault token file not given for protected download") @@ -534,6 +535,15 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien # --- 4. 
Retry with token --- response = requests.get(url, headers=headers, stream=True) + + # Databus API key required if only 401 Unauthorized + elif response.status_code == 401: + print(f"API key required for {url}") + if not databus_key: + raise ValueError("Databus API key not given for protected download") + + headers = {"X-API-KEY": databus_key} + response = requests.get(url, headers=headers, stream=True) try: response.raise_for_status() # Raise if still failing @@ -554,8 +564,10 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien file.write(data) progress_bar.close() + # TODO: could be a problem of github raw / openflaas if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: - raise IOError("Downloaded size does not match Content-Length header") + # raise IOError("Downloaded size does not match Content-Length header") + print(f"Warning: Downloaded size does not match Content-Length header:\nExpected {total_size_in_bytes}, got {progress_bar.n}") def __get_vault_access__(download_url: str, @@ -702,31 +714,38 @@ def wsha256(raw: str): return sha256(raw.encode('utf-8')).hexdigest() -def __handle_databus_collection__(uri: str) -> str: +def __handle_databus_collection__(uri: str, databus_key: str = None) -> str: headers = {"Accept": "text/sparql"} + if databus_key is not None: + headers["X-API-KEY"] = databus_key + return requests.get(uri, headers=headers).text -def __get_json_ld_from_databus__(uri: str) -> str: +def __get_json_ld_from_databus__(uri: str, databus_key: str = None) -> str: headers = {"Accept": "application/ld+json"} + if databus_key is not None: + headers["X-API-KEY"] = databus_key return requests.get(uri, headers=headers).text def __download_list__(urls: List[str], localDir: str, vault_token_file: str = None, + databus_key: str = None, auth_url: str = None, client_id: str = None) -> None: + fileLocalDir = localDir for url in urls: if localDir is None: host, account, group, artifact, version, file = 
__get_databus_id_parts__(url) - localDir = os.path.join(os.getcwd(), account, group, artifact, version if version is not None else "latest") - print(f"Local directory not given, using {localDir}") + fileLocalDir = os.path.join(os.getcwd(), account, group, artifact, version if version is not None else "latest") + print(f"Local directory not given, using {fileLocalDir}") file = url.split("/")[-1] - filename = os.path.join(localDir, file) + filename = os.path.join(fileLocalDir, file) print("\n") - __download_file__(url=url, filename=filename, vault_token_file=vault_token_file, auth_url=auth_url, client_id=client_id) + __download_file__(url=url, filename=filename, vault_token_file=vault_token_file, databus_key=databus_key, auth_url=auth_url, client_id=client_id) print("\n") @@ -742,6 +761,7 @@ def download( endpoint: str, databusURIs: List[str], token=None, + databus_key=None, auth_url=None, client_id=None ) -> None: @@ -771,15 +791,15 @@ def download( if "/collections/" in databusURI: # TODO "in" is not safe! 
there could be an artifact named collections, need to check for the correct part position in the URI query = __handle_databus_collection__(databusURI) res = __handle_databus_file_query__(endpoint, query) - __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__(res, localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) # databus file elif file is not None: - __download_list__([databusURI], localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__([databusURI], localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) # databus artifact version elif version is not None: json_str = __get_json_ld_from_databus__(databusURI) res = __handle_databus_artifact_version__(json_str) - __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__(res, localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) # databus artifact elif artifact is not None: json_str = __get_json_ld_from_databus__(databusURI) @@ -787,7 +807,7 @@ def download( print(f"No version given, using latest version: {latest}") json_str = __get_json_ld_from_databus__(latest) res = __handle_databus_artifact_version__(json_str) - __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__(res, localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) # databus group elif group is not None: @@ -800,7 +820,7 @@ def download( print(f"No version given, using latest version: {latest}") json_str = __get_json_ld_from_databus__(latest) res = __handle_databus_artifact_version__(json_str) - __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__(res, localDir, vault_token_file=token, 
databus_key=databus_key, auth_url=auth_url, client_id=client_id) # databus account elif account is not None: @@ -816,4 +836,4 @@ def download( if endpoint is None: # endpoint is required for queries (--databus) raise ValueError("No endpoint given for query") res = __handle_databus_file_query__(endpoint, databusURI) - __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__(res, localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) From c10c114f460f352cf5d831bf70f92454d83a71e3 Mon Sep 17 00:00:00 2001 From: Integer-Ctrl Date: Tue, 25 Nov 2025 18:41:00 +0100 Subject: [PATCH 2/8] refactored README.md --- README.md | 364 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 218 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index 0b65641..3870c0d 100644 --- a/README.md +++ b/README.md @@ -1,187 +1,280 @@ -# Databus Client Python +# Databus Python Client -## Quickstart Example -Commands to download the DBpedia Knowledge Graphs generated by Live Fusion. -DBpedia Live Fusion publishes two different kinds of KGs: +Command-line and Python client for downloading and deploying datasets on DBpedia Databus. -1. Open Core Knowledge Graphs under CC-BY-SA license, open with copyleft/share-alike, no registration needed -2. Industry Knowledge Graphs under BUSL 1.1 license, unrestricted for research and experimentation, commercial license for productive use, free registration needed. 
- -### Registration (Access Token) +## Table of Contents +- [Quickstart](#quickstart) + - [Docker](#docker) + - [Python](#python) +- [DBpedia](#dbpedia) + - [Registration (Access Token)](#registration-access-token) + - [DBpedia Knowledge Graphs](#dbpedia-knowledge-graphs) + - [Download Live Fusion KG Snapshot (BUSL 1.1, registration needed)](#download-live-fusion-kg-snapshot-busl-11-registration-needed) + - [Download Enriched Knowledge Graphs (BUSL 1.1, registration needed)](#download-enriched-knowledge-graphs-busl-11-registration-needed) + - [Download DBpedia Wikipedia Knowledge Graphs (CC-BY-SA, no registration needed)](#download-dbpedia-wikipedia-knowledge-graphs-cc-by-sa-no-registration-needed) + - [Download DBpedia Wikidata Knowledge Graphs (CC-BY-SA, no registration needed)](#download-dbpedia-wikidata-knowledge-graphs-cc-by-sa-no-registration-needed) +- [CLI Usage](#cli-usage) + - [Download](#cli-download) + - [Deploy](#cli-deploy) +- [Module Usage](#module-usage) + - [Deploy](#module-deploy) -1. If you do not have a DBpedia Account yet (Forum/Databus), please register at https://account.dbpedia.org -2. Login at https://account.dbpedia.org and create your token. -3. Save the token to a file `vault-token.dat`. -### Docker vs. Python -The databus-python-client comes as **docker** or **python** with these patterns. -`$DOWNLOADTARGET` can be any Databus URI including collections OR SPARQL query (or several thereof). Details are documented below. +# Quickstart + +The client has two main functionalities: downloading datasets from the Databus and deploying datasets to the Databus. The first section covers [downloading DBpedia Knowledge Graphs](#dbpedia-knowledge-graphs) using the client, the second section covers the [CLI usages](#cli-usage) of [download](#cli-download) and [deploy](#cli-deploy) commands, and the last section covers [module usage](#module-usage) for deploying datasets programmatically. 
+ +To use the *databus-python-client*, you can choose between **Docker** or **Python** usage. Both methods support all functionalities of the client. The docker image is available at [dbpedia/databus-python-client](https://hub.docker.com/r/dbpedia/databus-python-client). You can find below how to use both methods in the command line. + +## Python + +Requirements: [Python](https://www.python.org/downloads/) and [pip](https://pip.pypa.io/en/stable/installation/) + +Before using the client, install it via pip: + ```bash -# Docker -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOADTARGET --token vault-token.dat -# Python python3 -m pip install databusclient -databusclient download $DOWNLOADTARGET --token vault-token.dat ``` +You can then use the client in the command line: + +```bash +databusclient --help +databusclient deploy --help +databusclient download --help +``` + +## Docker + +Requirements: [Docker](https://docs.docker.com/get-docker/) + +```bash +docker run --rm -v $(pwd):/data dbpedia/databus-python-client --help +docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy --help +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download --help +``` + +# DBpedia + +Commands to download the [DBpedia Knowledge Graphs](#dbpedia-knowledge-graphs) generated by Live Fusion. DBpedia Live Fusion publishes two different kinds of KGs: + +1. Open Core Knowledge Graphs under CC-BY-SA license, open with copyleft/share-alike, no registration needed +2. Industry Knowledge Graphs under BUSL 1.1 license, unrestricted for research and experimentation, commercial license for productive use, free [registration](#registration-access-token) needed. + + +## Registration (Access Token) + +To download BUSL 1.1 licensed datasets, you need to register and get an access token. + +1. If you do not have a DBpedia Account yet (Forum/Databus), please register at https://account.dbpedia.org +2. 
Login at https://account.dbpedia.org and create your token +3. Save the token to a file, e.g. `vault-token.dat` + +## DBpedia Knowledge Graphs + ### Download Live Fusion KG Snapshot (BUSL 1.1, registration needed) -TODO One slogan sentence. [More information](https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-kg-snapshot) +High-frequency, conflict-resolved knowledge graph that merges Live Wikipedia and Wikidata signals into a single, queryable snapshot for enterprise consumption. [More information](https://databus.dev.dbpedia.link/fhofer/live-fusion-kg-snapshot) ```bash -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-kg-snapshot --token vault-token.dat +# Python +databusclient download https://databus.dev.dbpedia.link/fhofer/live-fusion-kg-snapshot --vault-token vault-token.dat +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/live-fusion-kg-snapshot --vault-token vault-token.dat ``` ### Download Enriched Knowledge Graphs (BUSL 1.1, registration needed) + **DBpedia Wikipedia Extraction Enriched** -TODO One slogan sentence and link -Currently EN DBpedia only. -```bash -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/dbpedia-wikipedia-kg-enriched-snapshot --token vault-token.dat -``` -**DBpedia Wikidata Extraction Enriched** -TODO One slogan sentence and link +DBpedia-based enrichment of structured Wikipedia extractions. [More information](https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-enriched-snapshot) + +Currently EN DBpedia only. 
```bash -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/dbpedia-wikidata-kg-enriched-snapshot --token vault-token.dat +# Python +databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-enriched-snapshot --vault-token vault-token.dat +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-enriched-snapshot --vault-token vault-token.dat ``` ### Download DBpedia Wikipedia Knowledge Graphs (CC-BY-SA, no registration needed) -TODO One slogan sentence and link + +Original extraction of structured Wikipedia data before enrichment. [More information](https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot) ```bash -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/dbpedia-wikipedia-kg-snapshot +# Python +databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot ``` ### Download DBpedia Wikidata Knowledge Graphs (CC-BY-SA, no registration needed) -TODO One slogan sentence and link + +Original extraction of structured Wikidata data before enrichment. 
[More information](https://databus.dev.dbpedia.link/fhofer/dbpedia-wikidata-kg-snapshot) ```bash -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/dbpedia-wikidata-kg-snapshot +# Python +databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikidata-kg-snapshot +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikidata-kg-snapshot ``` -## Docker Image Usage - -A docker image is available at [dbpedia/databus-python-client](https://hub.docker.com/r/dbpedia/databus-python-client). See [download section](#usage-of-docker-image) for details. - +# CLI Usage -## CLI Usage +To get started with the command-line interface (CLI) of the databus-python-client, you can use either the Python installation or the Docker image. Below are always examples for both methods. -**Installation** -```bash -python3 -m pip install databusclient -``` +**Help and further general information:** -**Running** ```bash +# Python databusclient --help -``` +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client --help -```man +# Output: Usage: databusclient [OPTIONS] COMMAND [ARGS]... + Databus Client CLI + Options: - --install-completion [bash|zsh|fish|powershell|pwsh] - Install completion for the specified shell. - --show-completion [bash|zsh|fish|powershell|pwsh] - Show completion for the specified shell, to - copy it or customize the installation. - --help Show this message and exit. + --help Show this message and exit. Commands: - deploy - download + deploy Flexible deploy to Databus command supporting three modes: + download Download datasets from databus, optionally using vault access... ``` + +## Download +With the download command, you can download datasets or parts thereof from the Databus. The download command expects one or more Databus URIs or a SPARQL query as arguments. 
The URIs can point to files, version, artifacts, groups, or collections. If a SPARQL query is provided, the query must return download URLs from the Databus which will be downloaded. -### Download command +```bash +# Python +databusclient download $DOWNLOADTARGET +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOADTARGET ``` + +- `$DOWNLOADTARGET` + - Can be any Databus URI including collections OR SPARQL query (or several thereof). +- `--localdir` + - If no `--localdir` is provided, the current working directory is used as base directory. The downloaded files will be stored in the working directory in a folder structure according to the databus structure, i.e. `./$ACCOUNT/$GROUP/$ARTIFACT/$VERSION/`. +- `--vault-token` + - If the dataset/files to be downloaded require vault authentication, you need to provide a vault token with `--vault-token /path/to/vault-token.dat`. See [Registration (Access Token)](#registration-access-token) for details on how to get a vault token. +- `--databus-key` + - If the databus is protected and needs API key authentication, you can provide the API key with `--databus-key YOUR_API_KEY`. + +**Help and further information on download command:** +```bash +# Python databusclient download --help -``` +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download --help -``` +# Output: Usage: databusclient download [OPTIONS] DATABUSURIS... -Arguments: - DATABUSURIS... databus uris to download from https://databus.dbpedia.org, - or a query statement that returns databus uris from https://databus.dbpedia.org/sparql - to be downloaded [required] - Download datasets from databus, optionally using vault access if vault options are provided. Options: - --localdir TEXT Local databus folder (if not given, databus folder - structure is created in current working directory) - --databus TEXT Databus URL (if not given, inferred from databusuri, e.g. 
- https://databus.dbpedia.org/sparql) - --token TEXT Path to Vault refresh token file - --authurl TEXT Keycloak token endpoint URL [default: - https://auth.dbpedia.org/realms/dbpedia/protocol/openid- - connect/token] - --clientid TEXT Client ID for token exchange [default: vault-token- - exchange] - --help Show this message and exit. Show this message and exit. + --localdir TEXT Local databus folder (if not given, databus folder + structure is created in current working directory) + --databus TEXT Databus URL (if not given, inferred from databusuri, + e.g. https://databus.dbpedia.org/sparql) + --vault-token TEXT Path to Vault refresh token file + --databus-key TEXT Databus API key to donwload from protected databus + --authurl TEXT Keycloak token endpoint URL [default: + https://auth.dbpedia.org/realms/dbpedia/protocol/openid- + connect/token] + --clientid TEXT Client ID for token exchange [default: vault-token- + exchange] + --help Show this message and exit. ``` -Examples of using download command +### Examples of using download command -**File**: download of a single file -``` +**Download File**: download of a single file +```bash +# Python databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2.ttl.bz2 ``` -**Version**: download of all files of a specific version -``` +**Download Version**: download of all files of a specific version +```bash +# Python databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 ``` -**Artifact**: download of all files with latest version of an artifact 
-``` +**Download Artifact**: download of all files with latest version of an artifact +```bash +# Python databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals ``` -**Group**: download of all files with lates version of all artifacts of a group -``` +**Download Group**: download of all files with lates version of all artifacts of a group +```bash +# Python databusclient download https://databus.dbpedia.org/dbpedia/mappings +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/mappings ``` -If no `--localdir` is provided, the current working directory is used as base directory. The downloaded files will be stored in the working directory in a folder structure according to the databus structure, i.e. `./$ACCOUNT/$GROUP/$ARTIFACT/$VERSION/`. - -**Collection**: download of all files within a collection -``` +**Download Collection**: download of all files within a collection +```bash +# Python databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 ``` -**Query**: download of all files returned by a query (sparql endpoint must be provided with `--databus`) -``` +**Download Query**: download of all files returned by a query (sparql endpoint must be provided with `--databus`) +```bash +# Python databusclient download 'PREFIX dcat: SELECT ?x WHERE { ?sub dcat:downloadURL ?x . } LIMIT 10' --databus https://databus.dbpedia.org/sparql +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download 'PREFIX dcat: SELECT ?x WHERE { ?sub dcat:downloadURL ?x . 
} LIMIT 10' --databus https://databus.dbpedia.org/sparql ``` -### Deploy command + +## Deploy + +With the deploy command, you can deploy datasets to the Databus. The deploy command supports three modes: +1. Classic dataset deployment via list of distributions +2. Metadata-based deployment via metadata JSON file +3. Upload & deploy via Nextcloud/WebDAV + +```bash +# Python +databusclient deploy [OPTIONS] [DISTRIBUTIONS]... +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy [OPTIONS] [DISTRIBUTIONS]... ``` + +**Help and further information on deploy command:** +```bash +# Python databusclient deploy --help -``` -``` +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy --help + +# Output: Usage: databusclient deploy [OPTIONS] [DISTRIBUTIONS]... - Flexible deploy to databus command: + Flexible deploy to Databus command supporting three modes: - - Classic dataset deployment + - Classic deploy (distributions as arguments) - - Metadata-based deployment + - Metadata-based deploy (--metadata ) - - Upload & deploy via Nextcloud + - Upload & deploy via Nextcloud (--webdav-url, --remote, --path) -Arguments: - DISTRIBUTIONS... 
Depending on mode: - - Classic mode: List of distributions in the form - URL|CV|fileext|compression|sha256sum:contentlength - (where URL is the download URL and CV the key=value pairs, - separated by underscores) - - Upload mode: List of local file or folder paths (must exist) - - Metdata mode: None - Options: --version-id TEXT Target databus version/dataset identifier of the form +## Deploy -## Module Usage ### Step 1: Create lists of distributions for the dataset ```python From c62966b782d1db07d7fb728107a7ecc97827e751 Mon Sep 17 00:00:00 2001 From: Integer-Ctrl Date: Tue, 25 Nov 2025 19:16:31 +0100 Subject: [PATCH 3/8] refactored README.md --- README.md | 138 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 3870c0d..5afcaa3 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ Command-line and Python client for downloading and deploying datasets on DBpedia ## Table of Contents - [Quickstart](#quickstart) - - [Docker](#docker) - [Python](#python) + - [Docker](#docker) - [DBpedia](#dbpedia) - [Registration (Access Token)](#registration-access-token) - [DBpedia Knowledge Graphs](#dbpedia-knowledge-graphs) @@ -21,13 +21,13 @@ Command-line and Python client for downloading and deploying datasets on DBpedia - [Deploy](#module-deploy) -# Quickstart +## Quickstart -The client has two main functionalities: downloading datasets from the Databus and deploying datasets to the Databus. The first section covers [downloading DBpedia Knowledge Graphs](#dbpedia-knowledge-graphs) using the client, the second section covers the [CLI usages](#cli-usage) of [download](#cli-download) and [deploy](#cli-deploy) commands, and the last section covers [module usage](#module-usage) for deploying datasets programmatically. +The client supports two main workflows: downloading datasets from the Databus and deploying datasets to the Databus. 
Below you can choose how to run it (Python or Docker), then follow the sections on [DBpedia downloads](#dbpedia-knowledge-graphs), [CLI usage](#cli-usage), or [module usage](#module-usage). -To use the *databus-python-client*, you can choose between **Docker** or **Python** usage. Both methods support all functionalities of the client. The docker image is available at [dbpedia/databus-python-client](https://hub.docker.com/r/dbpedia/databus-python-client). You can find below how to use both methods in the command line. +You can use either **Python** or **Docker**. Both methods support all client features. The Docker image is available at [dbpedia/databus-python-client](https://hub.docker.com/r/dbpedia/databus-python-client). -## Python +### Python Requirements: [Python](https://www.python.org/downloads/) and [pip](https://pip.pypa.io/en/stable/installation/) @@ -45,7 +45,7 @@ databusclient deploy --help databusclient download --help ``` -## Docker +### Docker Requirements: [Docker](https://docs.docker.com/get-docker/) @@ -55,25 +55,24 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy --help docker run --rm -v $(pwd):/data dbpedia/databus-python-client download --help ``` -# DBpedia +## DBpedia -Commands to download the [DBpedia Knowledge Graphs](#dbpedia-knowledge-graphs) generated by Live Fusion. DBpedia Live Fusion publishes two different kinds of KGs: +Commands to download the [DBpedia Knowledge Graphs](#dbpedia-knowledge-graphs) generated by Live Fusion. DBpedia Live Fusion publishes two kinds of KGs: -1. Open Core Knowledge Graphs under CC-BY-SA license, open with copyleft/share-alike, no registration needed -2. Industry Knowledge Graphs under BUSL 1.1 license, unrestricted for research and experimentation, commercial license for productive use, free [registration](#registration-access-token) needed. - +1. Open Core Knowledge Graphs under CC-BY-SA license, open with copyleft/share-alike, no registration needed. +2. 
Industry Knowledge Graphs under BUSL 1.1 license, unrestricted for research and experimentation, commercial license for productive use, free [registration](#registration-access-token) needed. -## Registration (Access Token) +### Registration (Access Token) To download BUSL 1.1 licensed datasets, you need to register and get an access token. -1. If you do not have a DBpedia Account yet (Forum/Databus), please register at https://account.dbpedia.org -2. Login at https://account.dbpedia.org and create your token -3. Save the token to a file, e.g. `vault-token.dat` +1. If you do not have a DBpedia Account yet (Forum/Databus), please register at https://account.dbpedia.org +2. Log in at https://account.dbpedia.org and create your token. +3. Save the token to a file, e.g. `vault-token.dat`. -## DBpedia Knowledge Graphs +### DBpedia Knowledge Graphs -### Download Live Fusion KG Snapshot (BUSL 1.1, registration needed) +#### Download Live Fusion KG Snapshot (BUSL 1.1, registration needed) High-frequency, conflict-resolved knowledge graph that merges Live Wikipedia and Wikidata signals into a single, queryable snapshot for enterprise consumption. [More information](https://databus.dev.dbpedia.link/fhofer/live-fusion-kg-snapshot) ```bash # Python @@ -82,13 +81,11 @@ databusclient download https://databus.dev.dbpedia.link/fhofer/live-fusion-kg-sn docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/live-fusion-kg-snapshot --vault-token vault-token.dat ``` -### Download Enriched Knowledge Graphs (BUSL 1.1, registration needed) +#### Download Enriched Knowledge Graphs (BUSL 1.1, registration needed) **DBpedia Wikipedia Extraction Enriched** -DBpedia-based enrichment of structured Wikipedia extractions. [More information](https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-enriched-snapshot) - -Currently EN DBpedia only. +DBpedia-based enrichment of structured Wikipedia extractions (currently EN DBpedia only). 
[More information](https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-enriched-snapshot) ```bash # Python @@ -97,30 +94,31 @@ databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-enriched-snapshot --vault-token vault-token.dat ``` -### Download DBpedia Wikipedia Knowledge Graphs (CC-BY-SA, no registration needed) +#### Download DBpedia Wikipedia Knowledge Graphs (CC-BY-SA, no registration needed) Original extraction of structured Wikipedia data before enrichment. [More information](https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot) ```bash # Python -databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot +databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot # Docker -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot ``` -### Download DBpedia Wikidata Knowledge Graphs (CC-BY-SA, no registration needed) + +#### Download DBpedia Wikidata Knowledge Graphs (CC-BY-SA, no registration needed) Original extraction of structured Wikidata data before enrichment. 
[More information](https://databus.dev.dbpedia.link/fhofer/dbpedia-wikidata-kg-snapshot) ```bash # Python -databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikidata-kg-snapshot +databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikidata-kg-snapshot # Docker docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikidata-kg-snapshot ``` -# CLI Usage +## CLI Usage -To get started with the command-line interface (CLI) of the databus-python-client, you can use either the Python installation or the Docker image. Below are always examples for both methods. +To get started with the command-line interface (CLI) of the databus-python-client, you can use either the Python installation or the Docker image. The examples below show both methods. **Help and further general information:** @@ -144,9 +142,9 @@ Commands: ``` -## Download +### Download -With the download command, you can download datasets or parts thereof from the Databus. The download command expects one or more Databus URIs or a SPARQL query as arguments. The URIs can point to files, version, artifacts, groups, or collections. If a SPARQL query is provided, the query must return download URLs from the Databus which will be downloaded. +With the download command, you can download datasets or parts thereof from the Databus. The download command expects one or more Databus URIs or a SPARQL query as arguments. The URIs can point to files, versions, artifacts, groups, or collections. If a SPARQL query is provided, the query must return download URLs from the Databus which will be downloaded. ```bash # Python @@ -158,7 +156,7 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD - `$DOWNLOADTARGET` - Can be any Databus URI including collections OR SPARQL query (or several thereof). 
- `--localdir` - - If no `--localdir` is provided, the current working directory is used as base directory. The downloaded files will be stored in the working directory in a folder structure according to the databus structure, i.e. `./$ACCOUNT/$GROUP/$ARTIFACT/$VERSION/`. + - If no `--localdir` is provided, the current working directory is used as base directory. The downloaded files will be stored in the working directory in a folder structure according to the Databus layout, i.e. `./$ACCOUNT/$GROUP/$ARTIFACT/$VERSION/`. - `--vault-token` - If the dataset/files to be downloaded require vault authentication, you need to provide a vault token with `--vault-token /path/to/vault-token.dat`. See [Registration (Access Token)](#registration-access-token) for details on how to get a vault token. - `--databus-key` @@ -192,14 +190,14 @@ Options: --help Show this message and exit. ``` -### Examples of using download command +### Examples of using the download command **Download File**: download of a single file ```bash # Python databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 # Docker -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2.ttl.bz2 +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 ``` **Download Version**: download of all files of a specific version @@ -210,15 +208,15 @@ databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 ``` -**Download Artifact**: download of all files with latest version of an artifact +**Download Artifact**: 
download of all files with the latest version of an artifact ```bash # Python databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals # Docker docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals -``` +``` -**Download Group**: download of all files with lates version of all artifacts of a group +**Download Group**: download of all files with the latest version of all artifacts of a group ```bash # Python databusclient download https://databus.dbpedia.org/dbpedia/mappings @@ -234,7 +232,7 @@ databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-s docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 ``` -**Download Query**: download of all files returned by a query (sparql endpoint must be provided with `--databus`) +**Download Query**: download of all files returned by a query (SPARQL endpoint must be provided with `--databus`) ```bash # Python databusclient download 'PREFIX dcat: SELECT ?x WHERE { ?sub dcat:downloadURL ?x . } LIMIT 10' --databus https://databus.dbpedia.org/sparql @@ -243,7 +241,7 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download 'PREFIX d ``` -## Deploy +### Deploy With the deploy command, you can deploy datasets to the Databus. The deploy command supports three modes: 1. Classic dataset deployment via list of distributions @@ -295,6 +293,7 @@ Options: ### Mode 1: Classic Deploy (Distributions) ```bash +# Python databusclient deploy \ --version-id https://databus.dbpedia.org/user1/group1/artifact1/2022-05-18 \ --title "Client Testing" \ @@ -302,7 +301,16 @@ databusclient deploy \ --description "Testing the client...." 
\
--license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 \
--apikey MYSTERIOUS \
-'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger'
+'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger'
+# Docker
+docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy \
+--version-id https://databus.dbpedia.org/user1/group1/artifact1/2022-05-18 \
+--title "Client Testing" \
+--abstract "Testing the client...." \
+--description "Testing the client...." \
+--license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 \
+--apikey MYSTERIOUS \
+'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger'
```

A few more notes for CLI usage:
@@ -317,6 +325,7 @@ Use a JSON metadata file to define all distributions.

The metadata.json should list all distributions and their metadata. All files referenced there will be registered on the Databus.

```bash
+# Python
databusclient deploy \
  --metadata ./metadata.json \
  --version-id https://databus.dbpedia.org/user1/group1/artifact1/1.0 \
  --title "Metadata Deploy Example" \
  --abstract "This is a short abstract of the dataset." \
  --description "This dataset was uploaded using metadata.json."
\ + --license https://dalicc.net/licenselibrary/Apache-2.0 \ + --apikey "API-KEY" ``` Example `metadata.json` metadata file structure (`file_format` and `compression` are optional): ```json @@ -350,11 +368,25 @@ Example `metadata.json` metadata file structure (`file_format` and `compression` Upload local files or folders to a WebDAV/Nextcloud instance and automatically deploy to DBpedia Databus. [Rclone](https://rclone.org/) is required. ```bash +# Python databusclient deploy \ --webdav-url https://cloud.example.com/remote.php/webdav \ --remote nextcloud \ --path datasets/mydataset \ - --version-id https://databus.org/user/dataset/version/1.0 \ + --version-id https://databus.dbpedia.org/user1/group1/artifact1/1.0 \ + --title "Test Dataset" \ + --abstract "Short abstract of dataset" \ + --description "This dataset was uploaded for testing the Nextcloud → Databus pipeline." \ + --license https://dalicc.net/licenselibrary/Apache-2.0 \ + --apikey "API-KEY" \ + ./localfile1.ttl \ + ./data_folder +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy \ + --webdav-url https://cloud.example.com/remote.php/webdav \ + --remote nextcloud \ + --path datasets/mydataset \ + --version-id https://databus.dbpedia.org/user1/group1/artifact1/1.0 \ --title "Test Dataset" \ --abstract "Short abstract of dataset" \ --description "This dataset was uploaded for testing the Nextcloud → Databus pipeline." 
\ @@ -364,12 +396,12 @@ databusclient deploy \ ./data_folder ``` -# Module Usage +## Module Usage -## Deploy +### Deploy -### Step 1: Create lists of distributions for the dataset +#### Step 1: Create lists of distributions for the dataset ```python from databusclient import create_distribution @@ -388,10 +420,10 @@ distributions.append( # will just place parameters correctly, nothing will be downloaded or inferred distributions.append( create_distribution( - url="https://example.org/some/random/file.csv.bz2", - cvs={"type": "example", "realfile": "false"}, - file_format="csv", - compression="bz2", + url="https://example.org/some/random/file.csv.bz2", + cvs={"type": "example", "realfile": "false"}, + file_format="csv", + compression="bz2", sha256_length_tuple=("7a751b6dd5eb8d73d97793c3c564c71ab7b565fa4ba619e4a8fd05a6f80ff653", 367116) ) ) @@ -402,7 +434,7 @@ A few notes: * The dict for content variants can be empty ONLY IF there is just one distribution * There can be no compression if there is no file format -### Step 2: Create dataset +#### Step 2: Create dataset ```python from databusclient import create_dataset @@ -431,14 +463,14 @@ dataset = create_dataset( ) ``` -NOTE: To be used you need to set all group parameters, or it will be ignored +NOTE: Group metadata is applied only if all group parameters are set. 
-### Step 3: Deploy to databus +#### Step 3: Deploy to Databus ```python from databusclient import deploy -# to deploy something you just need the dataset from the previous step and an APIO key +# to deploy something you just need the dataset from the previous step and an API key # API key can be found (or generated) at https://$$DATABUS_BASE$$/$$USER$$#settings -deploy(dataset, "mysterious api key") -``` \ No newline at end of file +deploy(dataset, "mysterious API key") +``` From 833872d78ab3be6d7515446ae19a0753de9bdf43 Mon Sep 17 00:00:00 2001 From: Integer-Ctrl Date: Tue, 2 Dec 2025 16:01:49 +0100 Subject: [PATCH 4/8] chore: readme examples - dev to prod databus --- README.md | 24 ++++++++++++------------ databusclient/client.py | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 5afcaa3..293c8dc 100644 --- a/README.md +++ b/README.md @@ -73,47 +73,47 @@ To download BUSL 1.1 licensed datasets, you need to register and get an access t ### DBpedia Knowledge Graphs #### Download Live Fusion KG Snapshot (BUSL 1.1, registration needed) -High-frequency, conflict-resolved knowledge graph that merges Live Wikipedia and Wikidata signals into a single, queryable snapshot for enterprise consumption. [More information](https://databus.dev.dbpedia.link/fhofer/live-fusion-kg-snapshot) +High-frequency, conflict-resolved knowledge graph that merges Live Wikipedia and Wikidata signals into a single, queryable snapshot for enterprise consumption. 
[More information](https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-kg-dump) ```bash # Python -databusclient download https://databus.dev.dbpedia.link/fhofer/live-fusion-kg-snapshot --vault-token vault-token.dat +databusclient download https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-kg-dump --vault-token vault-token.dat # Docker -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/live-fusion-kg-snapshot --vault-token vault-token.dat +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-kg-dump --vault-token vault-token.dat ``` #### Download Enriched Knowledge Graphs (BUSL 1.1, registration needed) **DBpedia Wikipedia Extraction Enriched** -DBpedia-based enrichment of structured Wikipedia extractions (currently EN DBpedia only). [More information](https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-enriched-snapshot) +DBpedia-based enrichment of structured Wikipedia extractions (currently EN DBpedia only). 
[More information](https://databus.dbpedia.org/dbpedia-enterprise/dbpedia-wikipedia-kg-enriched-dump) ```bash # Python -databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-enriched-snapshot --vault-token vault-token.dat +databusclient download https://databus.dbpedia.org/dbpedia-enterprise/dbpedia-wikipedia-kg-enriched-dump --vault-token vault-token.dat # Docker -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-enriched-snapshot --vault-token vault-token.dat +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/dbpedia-wikipedia-kg-enriched-dump --vault-token vault-token.dat ``` #### Download DBpedia Wikipedia Knowledge Graphs (CC-BY-SA, no registration needed) -Original extraction of structured Wikipedia data before enrichment. [More information](https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot) +Original extraction of structured Wikipedia data before enrichment. [More information](https://databus.dbpedia.org/dbpedia/dbpedia-wikipedia-kg-dump) ```bash # Python -databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot +databusclient download https://databus.dbpedia.org/dbpedia/dbpedia-wikipedia-kg-dump # Docker -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikipedia-kg-snapshot +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/dbpedia-wikipedia-kg-dump ``` #### Download DBpedia Wikidata Knowledge Graphs (CC-BY-SA, no registration needed) -Original extraction of structured Wikidata data before enrichment. [More information](https://databus.dev.dbpedia.link/fhofer/dbpedia-wikidata-kg-snapshot) +Original extraction of structured Wikidata data before enrichment. 
[More information](https://databus.dbpedia.org/dbpedia/dbpedia-wikidata-kg-dump) ```bash # Python -databusclient download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikidata-kg-snapshot +databusclient download https://databus.dbpedia.org/dbpedia/dbpedia-wikidata-kg-dump # Docker -docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dev.dbpedia.link/fhofer/dbpedia-wikidata-kg-snapshot +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/dbpedia-wikidata-kg-dump ``` ## CLI Usage diff --git a/databusclient/client.py b/databusclient/client.py index 1a16586..8138a84 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -520,7 +520,7 @@ def __download_file__(url, filename, vault_token_file=None, databus_key=None, au print("Redirects url: ", url) # --- 2. Try direct GET --- - response = requests.get(url, stream=True, allow_redirects=False) # no redirects here, we want to see if auth is required + response = requests.get(url, stream=True, allow_redirects=True) www = response.headers.get('WWW-Authenticate', '') # get WWW-Authenticate header if present to check for Bearer auth # Vault token required if 401 Unauthorized with Bearer challenge From cb917758cdc6518e515b1b02c6b91ae9427df6a6 Mon Sep 17 00:00:00 2001 From: Integer-Ctrl Date: Wed, 3 Dec 2025 09:53:40 +0100 Subject: [PATCH 5/8] feat: cli delete to delete datasets from databus --- README.md | 79 +++++++++++++++ databusclient/api/delete.py | 188 ++++++++++++++++++++++++++++++++++++ databusclient/api/utils.py | 37 +++++++ databusclient/cli.py | 21 ++++ databusclient/client.py | 38 +++----- 5 files changed, 338 insertions(+), 25 deletions(-) create mode 100644 databusclient/api/delete.py create mode 100644 databusclient/api/utils.py diff --git a/README.md b/README.md index 293c8dc..41a2213 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ Command-line and Python client for downloading and deploying 
datasets on DBpedia - [CLI Usage](#cli-usage) - [Download](#cli-download) - [Deploy](#cli-deploy) + - [Delete](#cli-delete) - [Module Usage](#module-usage) - [Deploy](#module-deploy) @@ -396,6 +397,84 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy \ ./data_folder ``` + +### Delete + +With the delete command you can delete collections, groups, artifacts, and versions from the Databus. Deleting files is not supported via API. + +**Note**: Deleting datasets will recursively delete all data associated with the dataset below the specified level. Please use this command with caution. As security measure, the delete command will prompt you for confirmation before proceeding with any deletion. + +```bash +# Python +databusclient delete [OPTIONS] DATABUSURIS... +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client delete [OPTIONS] DATABUSURIS... +``` + +**Help and further information on delete command:** +```bash +# Python +databusclient delete --help +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client delete --help + +# Output: +Usage: databusclient delete [OPTIONS] DATABUSURIS... + + Delete a dataset from the databus. + + Delete a group, artifact, or version identified by the given databus URI. + Will recursively delete all data associated with the dataset. + +Options: + --databus-key TEXT Databus API key to access protected databus [required] + --dry-run Perform a dry run without actual deletion + --force Force deletion without confirmation prompt + --help Show this message and exit. +``` + +To authenticate the delete request, you need to provide an API key with `--databus-key YOUR_API_KEY`. + +If you want to perform a dry run without actual deletion, use the `--dry-run` option. This will show you what would be deleted without making any changes. + +As securety measure, the delete command will prompt you for confirmation before proceeding with the deletion. 
If you want to skip this prompt, you can use the `--force` option. + +**Example of using the delete command** + +### Examples of using the download command + +**Delete Version**: delete a specific version +```bash +# Python +databusclient delete https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --databus-key YOUR_API_KEY +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client delete https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --databus-key YOUR_API_KEY +``` + +**Delete Artifact**: delete an artifact and all its versions +```bash +# Python +databusclient delete https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals --databus-key YOUR_API_KEY +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client delete https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals --databus-key YOUR_API_KEY +``` + +**Delete Group**: delete a group and all its artifacts and versions +```bash +# Python +databusclient delete https://databus.dbpedia.org/dbpedia/mappings --databus-key YOUR_API_KEY +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client delete https://databus.dbpedia.org/dbpedia/mappings --databus-key YOUR_API_KEY +``` + +**Delete Collection**: delete collection +```bash +# Python +databusclient delete https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 --databus-key YOUR_API_KEY +# Docker +docker run --rm -v $(pwd):/data dbpedia/databus-python-client delete https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 --databus-key YOUR_API_KEY +``` + ## Module Usage diff --git a/databusclient/api/delete.py b/databusclient/api/delete.py new file mode 100644 index 0000000..fcbaf08 --- /dev/null +++ b/databusclient/api/delete.py @@ -0,0 +1,188 @@ +import json +import requests +from typing import List + +from databusclient.api.utils import get_databus_id_parts_from_uri, get_json_ld_from_databus + +def 
_confirm_delete(databusURI: str) -> str: + """ + Confirm deletion of a Databus resource with the user. + + Parameters: + - databusURI: The full databus URI of the resource to delete + + Returns: + - "confirm" if the user confirms deletion + - "skip" if the user chooses to skip deletion + - "cancel" if the user chooses to cancel the entire deletion process + """ + print(f"Are you sure you want to delete: {databusURI}?") + print("\nThis action is irreversible and will permanently remove the resource and all its data.") + while True: + choice = input("Type 'yes'/'y' to confirm, 'skip'/'s' to skip this resource, or 'cancel'/'c' to abort: ").strip().lower() + if choice == "yes" or choice == "y": + return "confirm" + elif choice == "skip" or choice == "s": + return "skip" + elif choice == "cancel" or choice == "c": + return "cancel" + else: + print("Invalid input. Please type 'yes'/'y', 'skip'/'s', or 'cancel'/'c'.") + + +def _delete_resource(databusURI: str, databus_key: str, dry_run: bool = False, force: bool = False): + """ + Delete a single Databus resource (version, artifact, group). 
+ + Equivalent to: + curl -X DELETE "" -H "accept: */*" -H "X-API-KEY: " + + Parameters: + - databusURI: The full databus URI of the resource to delete + - databus_key: Databus API key to authenticate the deletion request + - dry_run: If True, do not perform the deletion but only print what would be deleted + - force: If True, skip confirmation prompt and proceed with deletion + """ + + # Confirm the deletion request, skip the request or cancel deletion process + if not (dry_run or force): + action = _confirm_delete(databusURI) + if action == "skip": + print(f"Skipping: {databusURI}\n") + return + if action == "cancel": + raise KeyboardInterrupt("Deletion cancelled by user.") + + if databus_key is None: + raise ValueError("Databus API key must be provided for deletion") + + headers = { + "accept": "*/*", + "X-API-KEY": databus_key + } + + if dry_run: + print(f"[DRY RUN] Would delete: {databusURI}") + return + + response = requests.delete(databusURI, headers=headers) + + if response.status_code in (200, 204): + print(f"Successfully deleted: {databusURI}") + else: + raise Exception(f"Failed to delete {databusURI}: {response.status_code} - {response.text}") + + +def _delete_list(databusURIs: List[str], databus_key: str, dry_run: bool = False, force: bool = False): + """ + Delete a list of Databus resources. + + Parameters: + - databusURIs: List of full databus URIs of the resources to delete + - databus_key: Databus API key to authenticate the deletion requests + """ + for databusURI in databusURIs: + _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force) + + +def _delete_artifact(databusURI: str, databus_key: str, dry_run: bool = False, force: bool = False): + """ + Delete an artifact and all its versions. + + This function first retrieves all versions of the artifact and then deletes them one by one. + Finally, it deletes the artifact itself. 
+
+    Parameters:
+    - databusURI: The full databus URI of the artifact to delete
+    - databus_key: Databus API key to authenticate the deletion requests
+    - dry_run: If True, do not perform the deletion but only print what would be deleted
+    """
+    artifact_body = get_json_ld_from_databus(databusURI, databus_key)
+
+    json_dict = json.loads(artifact_body)
+    versions = json_dict.get("databus:hasVersion")
+
+    # Single version case {}
+    if isinstance(versions, dict):
+        versions = [versions]
+    # Multiple versions case [{}, {}]
+
+    version_uris = [v["@id"] for v in versions if "@id" in v]
+    if not version_uris:
+        raise ValueError("No versions found in artifact JSON-LD")
+
+    # Delete all versions
+    _delete_list(version_uris, databus_key, dry_run=dry_run, force=force)
+
+    # Finally, delete the artifact itself
+    _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force)
+
+
+def _delete_group(databusURI: str, databus_key: str, dry_run: bool = False, force: bool = False):
+    """
+    Delete a group and all its artifacts and versions.
+
+    This function first retrieves all artifacts of the group, then deletes each artifact (which in turn deletes its versions).
+    Finally, it deletes the group itself. 
+ + Parameters: + - databusURI: The full databus URI of the group to delete + - databus_key: Databus API key to authenticate the deletion requests + - dry_run: If True, do not perform the deletion but only print what would be deleted + """ + group_body = get_json_ld_from_databus(databusURI, databus_key) + + json_dict = json.loads(group_body) + artifacts = json_dict.get("databus:hasArtifact", []) + + artifact_uris = [] + for item in artifacts: + uri = item.get("@id") + if not uri: + continue + _, _, _, _, version, _ = get_databus_id_parts_from_uri(uri) + if version is None: + artifact_uris.append(uri) + + # Delete all artifacts (which deletes their versions) + for artifact_uri in artifact_uris: + _delete_artifact(artifact_uri, databus_key, dry_run=dry_run, force=force) + + # Finally, delete the group itself + _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force) + +# TODO: add to README.md +def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool): + """ + Delete a dataset from the databus. + + Delete a group, artifact, or version identified by the given databus URI. + Will recursively delete all data associated with the dataset. 
+
+    Parameters:
+    - databusURIs: List of full databus URIs of the resources to delete
+    - databus_key: Databus API key to authenticate the deletion requests
+    - dry_run: If True, will only print what would be deleted without performing actual deletions
+    - force: If True, skip confirmation prompt and proceed with deletion
+    """
+
+    for databusURI in databusURIs:
+        host, account, group, artifact, version, file = get_databus_id_parts_from_uri(databusURI)
+
+        if group == "collections" and artifact is not None:
+            print(f"Deleting collection: {databusURI}")
+            _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force)
+        elif file is not None:
+            print(f"Deleting file is not supported via API: {databusURI}")
+            continue # skip file deletions
+        elif version is not None:
+            print(f"Deleting version: {databusURI}")
+            _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force)
+        elif artifact is not None:
+            print(f"Deleting artifact and all its versions: {databusURI}")
+            _delete_artifact(databusURI, databus_key, dry_run=dry_run, force=force)
+        elif group is not None and group != "collections":
+            print(f"Deleting group and all its artifacts and versions: {databusURI}")
+            _delete_group(databusURI, databus_key, dry_run=dry_run, force=force)
+        else:
+            print(f"Deleting {databusURI} is not supported.")
diff --git a/databusclient/api/utils.py b/databusclient/api/utils.py
new file mode 100644
index 0000000..a5646b2
--- /dev/null
+++ b/databusclient/api/utils.py
@@ -0,0 +1,37 @@
+import requests
+from typing import Tuple, Optional
+
+def get_databus_id_parts_from_uri(uri: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]:
+    """
+    Extract databus ID parts from a given databus URI.
+
+    Parameters:
+    - uri: The full databus URI
+
+    Returns:
+    A tuple containing (host, accountId, groupId, artifactId, versionId, fileId).
+    Each element is a string or None if not present. 
+ """ + uri = uri.removeprefix("https://").removeprefix("http://") + parts = uri.strip("/").split("/") + parts += [None] * (6 - len(parts)) # pad with None if less than 6 parts + return tuple(parts[:6]) # return only the first 6 parts + +def get_json_ld_from_databus(uri: str, databus_key: str = None) -> str: + """ + Retrieve JSON-LD representation of a databus resource. + + Parameters: + - uri: The full databus URI + - databus_key: Optional Databus API key for authentication on protected resources + + Returns: + JSON-LD string representation of the databus resource. + """ + headers = {"Accept": "application/ld+json"} + if databus_key is not None: + headers["X-API-KEY"] = databus_key + response = requests.get(uri, headers=headers) + response.raise_for_status() + + return response.text diff --git a/databusclient/cli.py b/databusclient/cli.py index 3209008..aa0551b 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -7,6 +7,7 @@ from databusclient import client from databusclient.rclone_wrapper import upload +from databusclient.api.delete import delete as api_delete @click.group() def app(): @@ -112,6 +113,26 @@ def download(databusuris: List[str], localdir, databus, vault_token, databus_key client_id=clientid, ) +@app.command() +@click.argument("databusuris", nargs=-1, required=True) +@click.option("--databus-key", help="Databus API key to access protected databus", required=True) +@click.option("--dry-run", is_flag=True, help="Perform a dry run without actual deletion") +@click.option("--force", is_flag=True, help="Force deletion without confirmation prompt") +def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool): + """ + Delete a dataset from the databus. + + Delete a group, artifact, or version identified by the given databus URI. + Will recursively delete all data associated with the dataset. 
+ """ + + api_delete( + databusURIs=databusuris, + databus_key=databus_key, + dry_run=dry_run, + force=force, + ) + if __name__ == "__main__": app() diff --git a/databusclient/client.py b/databusclient/client.py index 8138a84..25e3f8d 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -8,6 +8,8 @@ from hashlib import sha256 import os +from databusclient.api.utils import get_databus_id_parts_from_uri, get_json_ld_from_databus + __debug = False @@ -704,7 +706,7 @@ def __get_databus_artifacts_of_group__(json_str: str) -> List[str]: uri = item.get("@id") if not uri: continue - _, _, _, _, version, _ = __get_databus_id_parts__(uri) + _, _, _, _, version, _ = get_databus_id_parts_from_uri(uri) if version is None: result.append(uri) return result @@ -722,13 +724,6 @@ def __handle_databus_collection__(uri: str, databus_key: str = None) -> str: return requests.get(uri, headers=headers).text -def __get_json_ld_from_databus__(uri: str, databus_key: str = None) -> str: - headers = {"Accept": "application/ld+json"} - if databus_key is not None: - headers["X-API-KEY"] = databus_key - return requests.get(uri, headers=headers).text - - def __download_list__(urls: List[str], localDir: str, vault_token_file: str = None, @@ -738,7 +733,7 @@ def __download_list__(urls: List[str], fileLocalDir = localDir for url in urls: if localDir is None: - host, account, group, artifact, version, file = __get_databus_id_parts__(url) + host, account, group, artifact, version, file = get_databus_id_parts_from_uri(url) fileLocalDir = os.path.join(os.getcwd(), account, group, artifact, version if version is not None else "latest") print(f"Local directory not given, using {fileLocalDir}") @@ -749,13 +744,6 @@ def __download_list__(urls: List[str], print("\n") -def __get_databus_id_parts__(uri: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]: - uri = uri.removeprefix("https://").removeprefix("http://") - parts = 
uri.strip("/").split("/") - parts += [None] * (6 - len(parts)) # pad with None if less than 6 parts - return tuple(parts[:6]) # return only the first 6 parts - - def download( localDir: str, endpoint: str, @@ -778,7 +766,7 @@ def download( # TODO: make pretty for databusURI in databusURIs: - host, account, group, artifact, version, file = __get_databus_id_parts__(databusURI) + host, account, group, artifact, version, file = get_databus_id_parts_from_uri(databusURI) # dataID or databus collection if databusURI.startswith("http://") or databusURI.startswith("https://"): @@ -788,8 +776,8 @@ def download( print(f"SPARQL endpoint {endpoint}") # databus collection - if "/collections/" in databusURI: # TODO "in" is not safe! there could be an artifact named collections, need to check for the correct part position in the URI - query = __handle_databus_collection__(databusURI) + if group == "collections": + query = __handle_databus_collection__(databusURI, databus_key=databus_key) res = __handle_databus_file_query__(endpoint, query) __download_list__(res, localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) # databus file @@ -797,28 +785,28 @@ def download( __download_list__([databusURI], localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) # databus artifact version elif version is not None: - json_str = __get_json_ld_from_databus__(databusURI) + json_str = get_json_ld_from_databus(databusURI, databus_key=databus_key) res = __handle_databus_artifact_version__(json_str) __download_list__(res, localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) # databus artifact elif artifact is not None: - json_str = __get_json_ld_from_databus__(databusURI) + json_str = get_json_ld_from_databus(databusURI, databus_key=databus_key) latest = __get_databus_latest_version_of_artifact__(json_str) print(f"No version given, using latest version: {latest}") - 
json_str = __get_json_ld_from_databus__(latest) + json_str = get_json_ld_from_databus(latest, databus_key=databus_key) res = __handle_databus_artifact_version__(json_str) __download_list__(res, localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) # databus group elif group is not None: - json_str = __get_json_ld_from_databus__(databusURI) + json_str = get_json_ld_from_databus(databusURI, databus_key=databus_key) artifacts = __get_databus_artifacts_of_group__(json_str) for artifact_uri in artifacts: print(f"Processing artifact {artifact_uri}") - json_str = __get_json_ld_from_databus__(artifact_uri) + json_str = get_json_ld_from_databus(artifact_uri, databus_key=databus_key) latest = __get_databus_latest_version_of_artifact__(json_str) print(f"No version given, using latest version: {latest}") - json_str = __get_json_ld_from_databus__(latest) + json_str = get_json_ld_from_databus(latest, databus_key=databus_key) res = __handle_databus_artifact_version__(json_str) __download_list__(res, localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) From 9d6de501056efa3f24b506141992376ce1f63654 Mon Sep 17 00:00:00 2001 From: Integer-Ctrl Date: Thu, 4 Dec 2025 18:39:12 +0100 Subject: [PATCH 6/8] fix: coderabbit --- README.md | 16 +++++++--------- databusclient/api/delete.py | 11 +++++------ databusclient/api/utils.py | 4 ++-- databusclient/cli.py | 2 +- databusclient/client.py | 15 +++++++++------ 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 1d97598..8add7c5 100644 --- a/README.md +++ b/README.md @@ -67,8 +67,8 @@ Commands to download the [DBpedia Knowledge Graphs](#dbpedia-knowledge-graphs) g To download BUSL 1.1 licensed datasets, you need to register and get an access token. -1. If you do not have a DBpedia Account yet (Forum/Databus), please register at https://account.dbpedia.org -2. 
Log in at https://account.dbpedia.org and create your token. +1. If you do not have a DBpedia Account yet (Forum/Databus), please register at [https://account.dbpedia.org](https://account.dbpedia.org) +2. Log in at [https://account.dbpedia.org](https://account.dbpedia.org) and create your token. 3. Save the token to a file, e.g. `vault-token.dat`. ### DBpedia Knowledge Graphs @@ -182,7 +182,7 @@ Options: --databus TEXT Databus URL (if not given, inferred from databusuri, e.g. https://databus.dbpedia.org/sparql) --vault-token TEXT Path to Vault refresh token file - --databus-key TEXT Databus API key to donwload from protected databus + --databus-key TEXT Databus API key to download from protected databus --authurl TEXT Keycloak token endpoint URL [default: https://auth.dbpedia.org/realms/dbpedia/protocol/openid- connect/token] @@ -191,7 +191,7 @@ Options: --help Show this message and exit. ``` -### Examples of using the download command +#### Examples of using the download command **Download File**: download of a single file ```bash @@ -397,7 +397,7 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy \ ./data_folder ``` - + ### Delete With the delete command you can delete collections, groups, artifacts, and versions from the Databus. Deleting files is not supported via API. @@ -437,11 +437,9 @@ To authenticate the delete request, you need to provide an API key with `--datab If you want to perform a dry run without actual deletion, use the `--dry-run` option. This will show you what would be deleted without making any changes. -As securety measure, the delete command will prompt you for confirmation before proceeding with the deletion. If you want to skip this prompt, you can use the `--force` option. +As security measure, the delete command will prompt you for confirmation before proceeding with the deletion. If you want to skip this prompt, you can use the `--force` option. 
-**Example of using the delete command** - -### Examples of using the download command +#### Examples of using the delete command **Delete Version**: delete a specific version ```bash diff --git a/databusclient/api/delete.py b/databusclient/api/delete.py index fcbaf08..9d5836f 100644 --- a/databusclient/api/delete.py +++ b/databusclient/api/delete.py @@ -65,7 +65,7 @@ def _delete_resource(databusURI: str, databus_key: str, dry_run: bool = False, f print(f"[DRY RUN] Would delete: {databusURI}") return - response = requests.delete(databusURI, headers=headers) + response = requests.delete(databusURI, headers=headers, timeout=30) if response.status_code in (200, 204): print(f"Successfully deleted: {databusURI}") @@ -112,11 +112,10 @@ def _delete_artifact(databusURI: str, databus_key: str, dry_run: bool = False, f raise ValueError("No versions found in artifact JSON-LD") # Delete all versions - _delete_list(version_uris, databus_key, dry_run=dry_run) + _delete_list(version_uris, databus_key, dry_run=dry_run, force=force) # Finally, delete the artifact itself - _delete_resource(databusURI, databus_key, dry_run=dry_run) - + _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force) def _delete_group(databusURI: str, databus_key: str, dry_run: bool = False, force: bool = False): """ @@ -167,7 +166,7 @@ def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool) """ for databusURI in databusURIs: - host, account, group, artifact, version, file = get_databus_id_parts_from_uri(databusURI) + _host, _account, group, artifact, version, file = get_databus_id_parts_from_uri(databusURI) if group == "collections" and artifact is not None: print(f"Deleting collection: {databusURI}") @@ -185,4 +184,4 @@ def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool) print(f"Deleting group and all its artifacts and versions: {databusURI}") _delete_group(databusURI, databus_key, dry_run=dry_run, force=force) else: - print(f"Deleting 
${databusURI} is not supported.") + print(f"Deleting {databusURI} is not supported.") diff --git a/databusclient/api/utils.py b/databusclient/api/utils.py index a5646b2..1ffe421 100644 --- a/databusclient/api/utils.py +++ b/databusclient/api/utils.py @@ -17,7 +17,7 @@ def get_databus_id_parts_from_uri(uri: str) -> Tuple[Optional[str], Optional[str parts += [None] * (6 - len(parts)) # pad with None if less than 6 parts return tuple(parts[:6]) # return only the first 6 parts -def get_json_ld_from_databus(uri: str, databus_key: str = None) -> str: +def get_json_ld_from_databus(uri: str, databus_key: str | None = None) -> str: """ Retrieve JSON-LD representation of a databus resource. @@ -31,7 +31,7 @@ def get_json_ld_from_databus(uri: str, databus_key: str = None) -> str: headers = {"Accept": "application/ld+json"} if databus_key is not None: headers["X-API-KEY"] = databus_key - response = requests.get(uri, headers=headers) + response = requests.get(uri, headers=headers, timeout=30) response.raise_for_status() return response.text diff --git a/databusclient/cli.py b/databusclient/cli.py index aa0551b..c983544 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -96,7 +96,7 @@ def deploy(version_id, title, abstract, description, license_url, apikey, @click.option("--localdir", help="Local databus folder (if not given, databus folder structure is created in current working directory)") @click.option("--databus", help="Databus URL (if not given, inferred from databusuri, e.g. 
https://databus.dbpedia.org/sparql)") @click.option("--vault-token", help="Path to Vault refresh token file") -@click.option("--databus-key", help="Databus API key to donwload from protected databus") +@click.option("--databus-key", help="Databus API key to download from protected databus") @click.option("--authurl", default="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", show_default=True, help="Keycloak token endpoint URL") @click.option("--clientid", default="vault-token-exchange", show_default=True, help="Client ID for token exchange") def download(databusuris: List[str], localdir, databus, vault_token, databus_key, authurl, clientid): diff --git a/databusclient/client.py b/databusclient/client.py index 25e3f8d..f666d5e 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -568,8 +568,7 @@ def __download_file__(url, filename, vault_token_file=None, databus_key=None, au # TODO: could be a problem of github raw / openflaas if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: - # raise IOError("Downloaded size does not match Content-Length header") - print(f"Warning: Downloaded size does not match Content-Length header:\nExpected {total_size_in_bytes}, got {progress_bar.n}") + raise IOError("Downloaded size does not match Content-Length header") def __get_vault_access__(download_url: str, @@ -622,13 +621,14 @@ def __get_vault_access__(download_url: str, return vault_token -def __query_sparql__(endpoint_url, query) -> dict: +def __query_sparql__(endpoint_url, query, databus_key=None) -> dict: """ Query a SPARQL endpoint and return results in JSON format. 
Parameters: - endpoint_url: the URL of the SPARQL endpoint - query: the SPARQL query string + - databus_key: Optional API key for authentication Returns: - Dictionary containing the query results @@ -637,12 +637,14 @@ def __query_sparql__(endpoint_url, query) -> dict: sparql.method = 'POST' sparql.setQuery(query) sparql.setReturnFormat(JSON) + if databus_key is not None: + sparql.setCustomHttpHeaders({"X-API-KEY": databus_key}) results = sparql.query().convert() return results -def __handle_databus_file_query__(endpoint_url, query) -> List[str]: - result_dict = __query_sparql__(endpoint_url, query) +def __handle_databus_file_query__(endpoint_url, query, databus_key=None) -> List[str]: + result_dict = __query_sparql__(endpoint_url, query, databus_key=databus_key) for binding in result_dict['results']['bindings']: if len(binding.keys()) > 1: print("Error multiple bindings in query response") @@ -760,6 +762,7 @@ def download( endpoint: the databus endpoint URL databusURIs: identifiers to access databus registered datasets token: Path to Vault refresh token file + databus_key: Databus API key for protected downloads auth_url: Keycloak token endpoint URL client_id: Client ID for token exchange """ @@ -823,5 +826,5 @@ def download( print("QUERY {}", databusURI.replace("\n", " ")) if endpoint is None: # endpoint is required for queries (--databus) raise ValueError("No endpoint given for query") - res = __handle_databus_file_query__(endpoint, databusURI) + res = __handle_databus_file_query__(endpoint, databusURI, databus_key=databus_key) __download_list__(res, localDir, vault_token_file=token, databus_key=databus_key, auth_url=auth_url, client_id=client_id) From 565ad59c92da97756ff37b3528c20e3d8b139b85 Mon Sep 17 00:00:00 2001 From: Integer-Ctrl Date: Thu, 4 Dec 2025 18:54:25 +0100 Subject: [PATCH 7/8] fix: coderabbit --- databusclient/api/delete.py | 19 ++++++++++--------- databusclient/client.py | 12 ++++++------ 2 files changed, 16 insertions(+), 15 deletions(-) diff 
--git a/databusclient/api/delete.py b/databusclient/api/delete.py index 9d5836f..556841e 100644 --- a/databusclient/api/delete.py +++ b/databusclient/api/delete.py @@ -20,11 +20,11 @@ def _confirm_delete(databusURI: str) -> str: print("\nThis action is irreversible and will permanently remove the resource and all its data.") while True: choice = input("Type 'yes'/'y' to confirm, 'skip'/'s' to skip this resource, or 'cancel'/'c' to abort: ").strip().lower() - if choice == "yes" or choice == "y": + if choice in ("yes", "y"): return "confirm" - elif choice == "skip" or choice == "s": + elif choice in ("skip", "s"): return "skip" - elif choice == "cancel" or choice == "c": + elif choice in ("cancel", "c"): return "cancel" else: print("Invalid input. Please type 'yes'/'y', 'skip'/'s', or 'cancel'/'c'.") @@ -107,12 +107,14 @@ def _delete_artifact(databusURI: str, databus_key: str, dry_run: bool = False, f versions = [versions] # Multiple versions case [{}, {}] - version_uris = [v["@id"] for v in versions if "@id" in v] - if not version_uris: - raise ValueError("No versions found in artifact JSON-LD") + # If versions is None or empty skip + if not versions: + version_uris = [v["@id"] for v in versions if "@id" in v] + if not version_uris: + raise ValueError("No versions found in artifact JSON-LD") - # Delete all versions - _delete_list(version_uris, databus_key, dry_run=dry_run, force=force) + # Delete all versions + _delete_list(version_uris, databus_key, dry_run=dry_run, force=force) # Finally, delete the artifact itself _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force) @@ -150,7 +152,6 @@ def _delete_group(databusURI: str, databus_key: str, dry_run: bool = False, forc # Finally, delete the group itself _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force) -# TODO: add to README.md def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool): """ Delete a dataset from the databus. 
diff --git a/databusclient/client.py b/databusclient/client.py index f666d5e..994e731 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -522,7 +522,7 @@ def __download_file__(url, filename, vault_token_file=None, databus_key=None, au print("Redirects url: ", url) # --- 2. Try direct GET --- - response = requests.get(url, stream=True, allow_redirects=True) + response = requests.get(url, stream=True, allow_redirects=True, timeout=30) www = response.headers.get('WWW-Authenticate', '') # get WWW-Authenticate header if present to check for Bearer auth # Vault token required if 401 Unauthorized with Bearer challenge @@ -536,7 +536,7 @@ def __download_file__(url, filename, vault_token_file=None, databus_key=None, au headers = {"Authorization": f"Bearer {vault_token}"} # --- 4. Retry with token --- - response = requests.get(url, headers=headers, stream=True) + response = requests.get(url, headers=headers, stream=True, timeout=30) # Databus API key required if only 401 Unauthorized elif response.status_code == 401: @@ -545,7 +545,7 @@ def __download_file__(url, filename, vault_token_file=None, databus_key=None, au raise ValueError("Databus API key not given for protected download") headers = {"X-API-KEY": databus_key} - response = requests.get(url, headers=headers, stream=True) + response = requests.get(url, headers=headers, stream=True, timeout=30) try: response.raise_for_status() # Raise if still failing @@ -718,12 +718,12 @@ def wsha256(raw: str): return sha256(raw.encode('utf-8')).hexdigest() -def __handle_databus_collection__(uri: str, databus_key: str = None) -> str: +def __handle_databus_collection__(uri: str, databus_key: str | None = None) -> str: headers = {"Accept": "text/sparql"} if databus_key is not None: headers["X-API-KEY"] = databus_key - return requests.get(uri, headers=headers).text + return requests.get(uri, headers=headers, timeout=30).text def __download_list__(urls: List[str], @@ -735,7 +735,7 @@ def __download_list__(urls: 
List[str], fileLocalDir = localDir for url in urls: if localDir is None: - host, account, group, artifact, version, file = get_databus_id_parts_from_uri(url) + _host, account, group, artifact, version, file = get_databus_id_parts_from_uri(url) fileLocalDir = os.path.join(os.getcwd(), account, group, artifact, version if version is not None else "latest") print(f"Local directory not given, using {fileLocalDir}") From b6eb33e7b6131d7a82f171c9060f68158be04334 Mon Sep 17 00:00:00 2001 From: Integer-Ctrl Date: Thu, 4 Dec 2025 19:02:34 +0100 Subject: [PATCH 8/8] fix: coderabbit --- databusclient/api/delete.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/databusclient/api/delete.py b/databusclient/api/delete.py index 556841e..a3d7625 100644 --- a/databusclient/api/delete.py +++ b/databusclient/api/delete.py @@ -108,13 +108,15 @@ def _delete_artifact(databusURI: str, databus_key: str, dry_run: bool = False, f # Multiple versions case [{}, {}] # If versions is None or empty skip - if not versions: + if versions is None: + print(f"No versions found for artifact: {databusURI}") + else: version_uris = [v["@id"] for v in versions if "@id" in v] if not version_uris: - raise ValueError("No versions found in artifact JSON-LD") - - # Delete all versions - _delete_list(version_uris, databus_key, dry_run=dry_run, force=force) + print(f"No version URIs found in artifact JSON-LD for: {databusURI}") + else: + # Delete all versions + _delete_list(version_uris, databus_key, dry_run=dry_run, force=force) # Finally, delete the artifact itself _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force)