diff --git a/scripts/1-fetch/arxiv_fetch.py b/scripts/1-fetch/arxiv_fetch.py
index 23215261..4c97cc96 100755
--- a/scripts/1-fetch/arxiv_fetch.py
+++ b/scripts/1-fetch/arxiv_fetch.py
@@ -125,31 +125,6 @@ def parse_arguments():
     return args
 
 
-def initialize_data_file(file_path, headers):
-    """Initialize CSV file with headers if it doesn't exist."""
-    if not os.path.isfile(file_path):
-        with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
-            writer = csv.DictWriter(
-                file_obj, fieldnames=headers, dialect="unix"
-            )
-            writer.writeheader()
-
-
-def initialize_all_data_files(args):
-    """Initialize all data files used by this script.
-
-    Creates the data directory and initializes empty CSVs with headers.
-    """
-    if not args.enable_save:
-        return
-
-    os.makedirs(PATHS["data_1-fetch"], exist_ok=True)
-    initialize_data_file(FILE_ARXIV_COUNT, HEADER_COUNT)
-    initialize_data_file(FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT)
-    initialize_data_file(FILE_ARXIV_YEAR, HEADER_YEAR)
-    initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)
-
-
 def get_identifier_mapping():
     global IDENTIER_MAPPING
     LOGGER.info("Loading CC Legal Tool metadata for CC identifer mapping")
@@ -472,19 +447,6 @@ def query_arxiv(args, session):
     return data, cc_articles_found
 
 
-def rows_to_csv(args, fieldnames, rows, file_path):
-    if not args.enable_save:
-        return args
-
-    with open(file_path, "w", encoding="utf-8", newline="\n") as file_handle:
-        writer = csv.DictWriter(
-            file_handle, fieldnames=fieldnames, dialect="unix"
-        )
-        writer.writeheader()
-        for row in rows:
-            writer.writerow(row)
-
-
 def write_data(args, data):
     """
     Write fetched data to CSV files.
@@ -508,7 +470,9 @@ def write_data(args, data):
             }
         )
     rows.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET"))
-    rows_to_csv(args, HEADER_AUTHOR_BUCKET, rows, FILE_ARXIV_AUTHOR_BUCKET)
+    shared.rows_to_csv(
+        args, FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET, rows
+    )
 
     # Save category report
     # fetched_data["category_counts"]: {identifer: {category_code: count}}
@@ -527,7 +491,9 @@ def write_data(args, data):
             }
         )
     rows.sort(key=itemgetter("TOOL_IDENTIFIER", "CATEGORY_CODE"))
-    rows_to_csv(args, HEADER_CATEGORY_REPORT, rows, FILE_ARXIV_CATEGORY_REPORT)
+    shared.rows_to_csv(
+        args, FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT, rows
+    )
 
     # Save tool counts report
     # fetched_data["tool_counts"]: {identfier: count}
@@ -535,7 +501,7 @@ def write_data(args, data):
     for identifier, count in data["tool_counts"].items():
         rows.append({"TOOL_IDENTIFIER": identifier, "COUNT": count})
     rows.sort(key=itemgetter("TOOL_IDENTIFIER"))
-    rows_to_csv(args, HEADER_COUNT, rows, FILE_ARXIV_COUNT)
+    shared.rows_to_csv(args, FILE_ARXIV_COUNT, HEADER_COUNT, rows)
 
     # Save year count report
     # fetched_data["year_counts"]: {identifer: {year: count}}
@@ -546,7 +512,7 @@ def write_data(args, data):
             {"TOOL_IDENTIFIER": identifier, "YEAR": year, "COUNT": count}
         )
     rows.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR"))
-    rows_to_csv(args, HEADER_YEAR, rows, FILE_ARXIV_YEAR)
+    shared.rows_to_csv(args, FILE_ARXIV_YEAR, HEADER_YEAR, rows)
 
 
 def write_provence(args, cc_articles_found):
@@ -584,7 +550,6 @@ def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
-    initialize_all_data_files(args)
     get_identifier_mapping()
     session = shared.get_session()
     query_category_mapping(args, session)
diff --git a/scripts/1-fetch/github_fetch.py b/scripts/1-fetch/github_fetch.py
index 9ac62bd0..9582d784 100755
--- a/scripts/1-fetch/github_fetch.py
+++ b/scripts/1-fetch/github_fetch.py
@@ -28,7 +28,7 @@
 LOGGER, PATHS = shared.setup(__file__)
 
 # Constants
-FILE1_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
+FILE_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
 GH_TOKEN = os.getenv("GH_TOKEN")
 # Also see: https://en.wikipedia.org/wiki/Public-domain-equivalent_license
 GITHUB_TOOLS = [
@@ -40,7 +40,7 @@
     {"TOOL_IDENTIFIER": "Unlicense", "SPDX_IDENTIFIER": "Unlicense"},
     {"TOOL_IDENTIFIER": "Total public repositories", "SPDX_IDENTIFIER": "N/A"},
 ]
-HEADER1_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
+HEADER_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
 QUARTER = os.path.basename(PATHS["data_quarter"])
 
 
@@ -68,7 +68,7 @@ def parse_arguments():
 
 def check_for_completion():
     try:
-        with open(FILE1_COUNT, "r", newline="") as file_obj:
+        with open(FILE_COUNT, "r", newline="") as file_obj:
             reader = csv.DictReader(file_obj, dialect="unix")
             if len(list(reader)) == len(GITHUB_TOOLS):
                 raise shared.QuantifyingException(
@@ -78,27 +78,6 @@ def check_for_completion():
         pass  # File may not be found without --enable-save, etc.
 
 
-def write_data(args, tool_data):
-    if not args.enable_save:
-        return args
-
-    # Create data directory for this phase
-    os.makedirs(PATHS["data_phase"], exist_ok=True)
-
-    if len(tool_data) < len(GITHUB_TOOLS):
-        LOGGER.error("Unable to fetch all records. Aborting.")
-        return args
-
-    with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:
-        writer = csv.DictWriter(
-            file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
-        )
-        writer.writeheader()
-        for row in tool_data:
-            writer.writerow(row)
-    return args
-
-
 def query_github(args, session):
     tool_data = []
     for tool in GITHUB_TOOLS:
@@ -148,7 +127,10 @@ def main():
     session.headers.update({"authorization": f"Bearer {GH_TOKEN}"})
 
     tool_data = query_github(args, session)
-    args = write_data(args, tool_data)
+    if len(tool_data) < len(GITHUB_TOOLS):
+        LOGGER.error("Unable to fetch all records. Aborting.")
Aborting.") + return args + shared.rows_to_csv(args, FILE_COUNT, HEADER_COUNT, tool_data) args = shared.git_add_and_commit( args, PATHS["repo"], diff --git a/scripts/1-fetch/openverse_fetch.py b/scripts/1-fetch/openverse_fetch.py index fae1bf15..2285ba9c 100755 --- a/scripts/1-fetch/openverse_fetch.py +++ b/scripts/1-fetch/openverse_fetch.py @@ -13,7 +13,6 @@ # Standard library import argparse -import csv import os import sys import textwrap @@ -192,27 +191,12 @@ def query_openverse(session): return aggregate -def write_data(args, data): - if not args.enable_save: - return - os.makedirs(PATHS["data_phase"], exist_ok=True) - with open(FILE_PATH, "w", encoding="utf-8", newline="") as file_obj: - writer = csv.DictWriter( - file_obj, - fieldnames=OPENVERSE_FIELDS, - dialect="unix", - ) - writer.writeheader() - for row in data: - writer.writerow(row) - - def main(): args = parse_arguments() LOGGER.info("Starting Openverse Fetch Script...") session = shared.get_session(accept_header="application/json") records = query_openverse(session) - write_data(args, records) + shared.rows_to_csv(args, FILE_PATH, OPENVERSE_FIELDS, records) LOGGER.info(f"Fetched {len(records)} unique Openverse records.") diff --git a/scripts/1-fetch/smithsonian_fetch.py b/scripts/1-fetch/smithsonian_fetch.py index 7b74d356..bab9bf7d 100755 --- a/scripts/1-fetch/smithsonian_fetch.py +++ b/scripts/1-fetch/smithsonian_fetch.py @@ -95,32 +95,6 @@ def check_for_completion(): ) -def write_data(args, data_metrics, data_units): - if not args.enable_save: - return args - - # Create data directory for this phase - os.makedirs(PATHS["data_phase"], exist_ok=True) - - with open(FILE_1_METRICS, "w", encoding="utf-8", newline="\n") as file_obj: - writer = csv.DictWriter( - file_obj, fieldnames=HEADER_1_METRICS, dialect="unix" - ) - writer.writeheader() - for row in data_metrics: - writer.writerow(row) - - with open(FILE_2_UNITS, "w", encoding="utf-8", newline="\n") as file_obj: - writer = csv.DictWriter( - file_obj, fieldnames=HEADER_2_UNITS, dialect="unix" - ) - writer.writeheader() - for row in data_units: - writer.writerow(row) - - return args - - def query_smithsonian(args, session): if not DATA_GOV_API_KEY: raise shared.QuantifyingException( @@ -177,7 +151,8 @@ def main(): check_for_completion() session = shared.get_session() data_metrics, data_units = query_smithsonian(args, session) - args = write_data(args, data_metrics, data_units) + shared.rows_to_csv(args, FILE_1_METRICS, HEADER_1_METRICS, data_metrics) + shared.rows_to_csv(args, FILE_2_UNITS, HEADER_2_UNITS, data_units) args = shared.git_add_and_commit( args, PATHS["repo"], diff --git a/scripts/1-fetch/wikipedia_fetch.py b/scripts/1-fetch/wikipedia_fetch.py index efabc327..d3032b5e 100755 --- a/scripts/1-fetch/wikipedia_fetch.py +++ b/scripts/1-fetch/wikipedia_fetch.py @@ -65,7 +65,9 @@ def parse_arguments(): def check_for_completion(): try: - with open(FILE_LANGUAGES, "r", newline="") as file_obj: + with open( + FILE_LANGUAGES, "r", encoding="utf-8", newline="" + ) as file_obj: reader = csv.DictReader(file_obj, dialect="unix") if len(list(reader)) > 300: raise shared.QuantifyingException( @@ -75,22 +77,6 @@ def check_for_completion(): pass # File may not be found without --enable-save, etc. 
-def write_data(args, tool_data):
-    if not args.enable_save:
-        return args
-    LOGGER.info("Saving fetched data")
-    os.makedirs(PATHS["data_phase"], exist_ok=True)
-
-    with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:
-        writer = csv.DictWriter(
-            file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
-        )
-        writer.writeheader()
-        for row in tool_data:
-            writer.writerow(row)
-    return args
-
-
 def query_wikipedia_languages(session):
     LOGGER.info("Fetching article counts from all language Wikipedias")
     tool_data = []
@@ -173,7 +159,7 @@ def main():
     shared.git_fetch_and_merge(args, PATHS["repo"])
     session = shared.get_session()
     tool_data = query_wikipedia_languages(session)
-    args = write_data(args, tool_data)
+    shared.rows_to_csv(args, FILE_LANGUAGES, HEADER_LANGUAGES, tool_data)
     args = shared.git_add_and_commit(
         args,
         PATHS["repo"],
diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py
index 12fd3942..a9a60c9a 100755
--- a/scripts/2-process/gcs_process.py
+++ b/scripts/2-process/gcs_process.py
@@ -121,7 +121,7 @@ def process_product_totals(args, count_data):
         data.items(), columns=["CC legal tool product", "Count"]
     )
     file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv")
-    shared.data_to_csv(args, data, file_path)
+    shared.dataframe_to_csv(args, data, file_path)
 
 
 def process_latest_prior_retired_totals(args, count_data):
@@ -202,7 +202,7 @@ def process_latest_prior_retired_totals(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], f"gcs_status_{key}_totals.csv"
     )
-    shared.data_to_csv(args, dataframe, file_path)
+    shared.dataframe_to_csv(args, dataframe, file_path)
 
 
 def process_totals_by_free_cultural(args, count_data):
@@ -235,7 +235,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
     )
-    shared.data_to_csv(args, data, file_path)
+    shared.dataframe_to_csv(args, data, file_path)
 
 
 def process_totals_by_restrictions(args, count_data):
@@ -269,7 +269,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
     )
-    shared.data_to_csv(args, data, file_path)
+    shared.dataframe_to_csv(args, data, file_path)
 
 
 def process_totals_by_language(args, data):
@@ -290,7 +290,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "gcs_totals_by_language.csv"
     )
-    shared.data_to_csv(args, data, file_path)
+    shared.dataframe_to_csv(args, data, file_path)
 
 
 def process_totals_by_country(args, data):
@@ -311,7 +311,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "gcs_totals_by_country.csv"
     )
-    shared.data_to_csv(args, data, file_path)
+    shared.dataframe_to_csv(args, data, file_path)
 
 
 def main():
diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py
index 85c0f285..8fc9348b 100755
--- a/scripts/2-process/github_process.py
+++ b/scripts/2-process/github_process.py
@@ -96,7 +96,7 @@ def process_totals_by_license(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "github_totals_by_license.csv"
     )
-    shared.data_to_csv(args, data, file_path)
+    shared.dataframe_to_csv(args, data, file_path)
 
 
 def process_totals_by_restriction(args, count_data):
@@ -130,7 +130,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "github_totals_by_restriction.csv"
     )
-    shared.data_to_csv(args, data, file_path)
+    shared.dataframe_to_csv(args, data, file_path)
 
 
 def main():
diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py
index 085956de..c625b967 100755
--- a/scripts/2-process/wikipedia_process.py
+++ b/scripts/2-process/wikipedia_process.py
@@ -103,7 +103,7 @@ def process_highest_language_usage(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
     )
-    shared.data_to_csv(args, top_10, file_path)
+    shared.dataframe_to_csv(args, top_10, file_path)
 
 
 def process_least_language_usage(args, count_data):
@@ -126,7 +126,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_least_language_usage.csv"
     )
-    shared.data_to_csv(args, bottom_10, file_path)
+    shared.dataframe_to_csv(args, bottom_10, file_path)
 
 
 def process_language_representation(args, count_data):
@@ -152,7 +152,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_language_representation.csv"
     )
-    shared.data_to_csv(args, language_counts, file_path)
+    shared.dataframe_to_csv(args, language_counts, file_path)
 
 
 def main():
diff --git a/scripts/shared.py b/scripts/shared.py
index 1ba14326..3d7140d6 100644
--- a/scripts/shared.py
+++ b/scripts/shared.py
@@ -37,7 +37,7 @@ def __init__(self, message, exit_code=None):
         super().__init__(self.message)
 
 
-def data_to_csv(args, data, file_path):
+def dataframe_to_csv(args, data, file_path):
     if not args.enable_save:
         return
     os.makedirs(args.paths["data_phase"], exist_ok=True)
@@ -233,6 +233,24 @@ def paths_list_update(logger, paths_list, old_quarter, new_quarter):
     return paths_list
 
 
+def rows_to_csv(args, file_path, fieldnames, rows):
+    """Write rows to a CSV file if saving is enabled."""
+    if not args.enable_save:
+        return
+
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+    with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
+        writer = csv.DictWriter(
+            file_obj,
+            fieldnames=fieldnames,
+            dialect="unix",
+        )
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row)
+
+
 class ColoredFormatter(logging.Formatter):
     """Adds colors to log messages."""
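
Example usage (not part of the patch): a minimal sketch of the two
consolidated helpers after this change, assuming `scripts/shared.py` is
importable as `shared` and pandas is installed. The `args` namespace and the
file paths below are hypothetical; note that `shared.rows_to_csv` takes
positional arguments in the order (args, file_path, fieldnames, rows), and
both helpers are no-ops unless args.enable_save is set.

    import argparse

    import pandas as pd

    import shared

    # Hypothetical stand-in for the parsed CLI arguments; dataframe_to_csv
    # reads args.paths["data_phase"], rows_to_csv only reads args.enable_save.
    args = argparse.Namespace(enable_save=True, paths={"data_phase": "data/tmp"})

    # Row-oriented writer: list of dicts keyed by the fieldnames
    shared.rows_to_csv(
        args,
        "data/tmp/example_count.csv",  # hypothetical path
        ["TOOL_IDENTIFIER", "COUNT"],
        [{"TOOL_IDENTIFIER": "CC0-1.0", "COUNT": 1}],
    )

    # DataFrame writer (renamed from data_to_csv)
    data = pd.DataFrame({"TOOL_IDENTIFIER": ["CC0-1.0"], "COUNT": [1]})
    shared.dataframe_to_csv(args, data, "data/tmp/example_totals.csv")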