51 changes: 8 additions & 43 deletions scripts/1-fetch/arxiv_fetch.py
@@ -125,31 +125,6 @@ def parse_arguments():
return args


def initialize_data_file(file_path, headers):
"""Initialize CSV file with headers if it doesn't exist."""
if not os.path.isfile(file_path):
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=headers, dialect="unix"
)
writer.writeheader()


def initialize_all_data_files(args):
"""Initialize all data files used by this script.

Creates the data directory and initializes empty CSVs with headers.
"""
if not args.enable_save:
return

os.makedirs(PATHS["data_1-fetch"], exist_ok=True)
initialize_data_file(FILE_ARXIV_COUNT, HEADER_COUNT)
initialize_data_file(FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT)
initialize_data_file(FILE_ARXIV_YEAR, HEADER_YEAR)
initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)


def get_identifier_mapping():
global IDENTIER_MAPPING
LOGGER.info("Loading CC Legal Tool metadata for CC identifer mapping")
@@ -472,19 +447,6 @@ def query_arxiv(args, session):
return data, cc_articles_found


def rows_to_csv(args, fieldnames, rows, file_path):
if not args.enable_save:
return args

with open(file_path, "w", encoding="utf-8", newline="\n") as file_handle:
writer = csv.DictWriter(
file_handle, fieldnames=fieldnames, dialect="unix"
)
writer.writeheader()
for row in rows:
writer.writerow(row)


def write_data(args, data):
"""
Write fetched data to CSV files.
@@ -508,7 +470,9 @@ def write_data(args, data):
}
)
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET"))
rows_to_csv(args, HEADER_AUTHOR_BUCKET, rows, FILE_ARXIV_AUTHOR_BUCKET)
shared.rows_to_csv(
args, HEADER_AUTHOR_BUCKET, rows, FILE_ARXIV_AUTHOR_BUCKET
)

# Save category report
# fetched_data["category_counts"]: {identifer: {category_code: count}}
@@ -527,15 +491,17 @@ def write_data(args, data):
}
)
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "CATEGORY_CODE"))
rows_to_csv(args, HEADER_CATEGORY_REPORT, rows, FILE_ARXIV_CATEGORY_REPORT)
shared.rows_to_csv(
args, HEADER_CATEGORY_REPORT, rows, FILE_ARXIV_CATEGORY_REPORT
)

# Save tool counts report
# fetched_data["tool_counts"]: {identfier: count}
rows = []
for identifier, count in data["tool_counts"].items():
rows.append({"TOOL_IDENTIFIER": identifier, "COUNT": count})
rows.sort(key=itemgetter("TOOL_IDENTIFIER"))
rows_to_csv(args, HEADER_COUNT, rows, FILE_ARXIV_COUNT)
shared.rows_to_csv(args, HEADER_COUNT, rows, FILE_ARXIV_COUNT)

# Save year count report
# fetched_data["year_counts"]: {identifer: {year: count}}
@@ -546,7 +512,7 @@ def write_data(args, data):
{"TOOL_IDENTIFIER": identifier, "YEAR": year, "COUNT": count}
)
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR"))
rows_to_csv(args, HEADER_YEAR, rows, FILE_ARXIV_YEAR)
shared.rows_to_csv(args, HEADER_YEAR, rows, FILE_ARXIV_YEAR)


def write_provence(args, cc_articles_found):
@@ -584,7 +550,6 @@ def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
shared.git_fetch_and_merge(args, PATHS["repo"])
initialize_all_data_files(args)
get_identifier_mapping()
session = shared.get_session()
query_category_mapping(args, session)
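Note: the per-script CSV writers removed above are replaced by calls to shared.rows_to_csv, which lives in scripts/shared.py and is not shown in this diff. A minimal sketch of what such a helper could look like, assuming it keeps the behavior of the removed local rows_to_csv (unix dialect, UTF-8, LF newlines, --enable-save guard) and also creates the destination directory now that initialize_all_data_files is gone; the real parameter order and behavior may differ:

# Hypothetical sketch only, not the actual scripts/shared.py implementation.
import csv
import os


def rows_to_csv(args, fieldnames, rows, file_path):
    """Write dict rows to a unix-dialect CSV, skipping when saving is disabled."""
    if not args.enable_save:
        return
    # Assumed: the shared helper creates the destination directory itself,
    # since the per-script os.makedirs() calls were removed.
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
    with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=fieldnames, dialect="unix"
        )
        writer.writeheader()
        writer.writerows(rows)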
32 changes: 7 additions & 25 deletions scripts/1-fetch/github_fetch.py
@@ -28,7 +28,7 @@
LOGGER, PATHS = shared.setup(__file__)

# Constants
FILE1_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
FILE_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
GH_TOKEN = os.getenv("GH_TOKEN")
# Also see: https://en.wikipedia.org/wiki/Public-domain-equivalent_license
GITHUB_TOOLS = [
@@ -40,7 +40,7 @@
{"TOOL_IDENTIFIER": "Unlicense", "SPDX_IDENTIFIER": "Unlicense"},
{"TOOL_IDENTIFIER": "Total public repositories", "SPDX_IDENTIFIER": "N/A"},
]
HEADER1_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
HEADER_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
QUARTER = os.path.basename(PATHS["data_quarter"])


@@ -68,7 +68,7 @@ def parse_arguments():

def check_for_completion():
try:
with open(FILE1_COUNT, "r", newline="") as file_obj:
with open(FILE_COUNT, "r", newline="") as file_obj:
[Review comment from a Member] Please update per #217:

        with open(FILE_COUNT, "r", encoding="utf-8") as file_obj:

reader = csv.DictReader(file_obj, dialect="unix")
if len(list(reader)) == len(GITHUB_TOOLS):
raise shared.QuantifyingException(
@@ -78,27 +78,6 @@ def check_for_completion():
pass # File may not be found without --enable-save, etc.


def write_data(args, tool_data):
if not args.enable_save:
return args

# Create data directory for this phase
os.makedirs(PATHS["data_phase"], exist_ok=True)

if len(tool_data) < len(GITHUB_TOOLS):
LOGGER.error("Unable to fetch all records. Aborting.")
return args

with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
)
writer.writeheader()
for row in tool_data:
writer.writerow(row)
return args


def query_github(args, session):
tool_data = []
for tool in GITHUB_TOOLS:
@@ -148,7 +127,10 @@ def main():
session.headers.update({"authorization": f"Bearer {GH_TOKEN}"})

tool_data = query_github(args, session)
args = write_data(args, tool_data)
if len(tool_data) < len(GITHUB_TOOLS):
LOGGER.error("Unable to fetch all records. Aborting.")
return args
shared.rows_to_csv(args, FILE_COUNT, HEADER_COUNT, tool_data)
args = shared.git_add_and_commit(
args,
PATHS["repo"],
18 changes: 1 addition & 17 deletions scripts/1-fetch/openverse_fetch.py
@@ -13,7 +13,6 @@

# Standard library
import argparse
import csv
import os
import sys
import textwrap
@@ -192,27 +191,12 @@ def query_openverse(session):
return aggregate


def write_data(args, data):
if not args.enable_save:
return
os.makedirs(PATHS["data_phase"], exist_ok=True)
with open(FILE_PATH, "w", encoding="utf-8", newline="") as file_obj:
writer = csv.DictWriter(
file_obj,
fieldnames=OPENVERSE_FIELDS,
dialect="unix",
)
writer.writeheader()
for row in data:
writer.writerow(row)


def main():
args = parse_arguments()
LOGGER.info("Starting Openverse Fetch Script...")
session = shared.get_session(accept_header="application/json")
records = query_openverse(session)
write_data(args, records)
shared.rows_to_csv(args, FILE_PATH, OPENVERSE_FIELDS, records)
LOGGER.info(f"Fetched {len(records)} unique Openverse records.")


29 changes: 2 additions & 27 deletions scripts/1-fetch/smithsonian_fetch.py
@@ -95,32 +95,6 @@ def check_for_completion():
)


def write_data(args, data_metrics, data_units):
if not args.enable_save:
return args

# Create data directory for this phase
os.makedirs(PATHS["data_phase"], exist_ok=True)

with open(FILE_1_METRICS, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=HEADER_1_METRICS, dialect="unix"
)
writer.writeheader()
for row in data_metrics:
writer.writerow(row)

with open(FILE_2_UNITS, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=HEADER_2_UNITS, dialect="unix"
)
writer.writeheader()
for row in data_units:
writer.writerow(row)

return args


def query_smithsonian(args, session):
if not DATA_GOV_API_KEY:
raise shared.QuantifyingException(
@@ -177,7 +151,8 @@ def main():
check_for_completion()
session = shared.get_session()
data_metrics, data_units = query_smithsonian(args, session)
args = write_data(args, data_metrics, data_units)
shared.rows_to_csv(args, FILE_1_METRICS, HEADER_1_METRICS, data_metrics)
shared.rows_to_csv(args, FILE_2_UNITS, HEADER_2_UNITS, data_units)
args = shared.git_add_and_commit(
args,
PATHS["repo"],
22 changes: 4 additions & 18 deletions scripts/1-fetch/wikipedia_fetch.py
@@ -65,7 +65,9 @@ def parse_arguments():

def check_for_completion():
try:
with open(FILE_LANGUAGES, "r", newline="") as file_obj:
with open(
FILE_LANGUAGES, "r", encoding="utf-8", newline=""
) as file_obj:
reader = csv.DictReader(file_obj, dialect="unix")
if len(list(reader)) > 300:
raise shared.QuantifyingException(
@@ -75,22 +77,6 @@ def check_for_completion():
pass # File may not be found without --enable-save, etc.


def write_data(args, tool_data):
if not args.enable_save:
return args
LOGGER.info("Saving fetched data")
os.makedirs(PATHS["data_phase"], exist_ok=True)

with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
)
writer.writeheader()
for row in tool_data:
writer.writerow(row)
return args


def query_wikipedia_languages(session):
LOGGER.info("Fetching article counts from all language Wikipedias")
tool_data = []
@@ -173,7 +159,7 @@ def main():
shared.git_fetch_and_merge(args, PATHS["repo"])
session = shared.get_session()
tool_data = query_wikipedia_languages(session)
args = write_data(args, tool_data)
shared.rows_to_csv(args, FILE_LANGUAGES, HEADER_LANGUAGES, tool_data)
args = shared.git_add_and_commit(
args,
PATHS["repo"],
12 changes: 6 additions & 6 deletions scripts/2-process/gcs_process.py
@@ -121,7 +121,7 @@ def process_product_totals(args, count_data):
data.items(), columns=["CC legal tool product", "Count"]
)
file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv")
shared.data_to_csv(args, data, file_path)
shared.dataframe_to_csv(args, data, file_path)


def process_latest_prior_retired_totals(args, count_data):
@@ -202,7 +202,7 @@ def process_latest_prior_retired_totals(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], f"gcs_status_{key}_totals.csv"
)
shared.data_to_csv(args, dataframe, file_path)
shared.dataframe_to_csv(args, dataframe, file_path)


def process_totals_by_free_cultural(args, count_data):
@@ -235,7 +235,7 @@ def process_totals_by_free_cultural(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
)
shared.data_to_csv(args, data, file_path)
shared.dataframe_to_csv(args, data, file_path)


def process_totals_by_restrictions(args, count_data):
@@ -269,7 +269,7 @@ def process_totals_by_restrictions(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
)
shared.data_to_csv(args, data, file_path)
shared.dataframe_to_csv(args, data, file_path)


def process_totals_by_language(args, data):
@@ -290,7 +290,7 @@ def process_totals_by_language(args, data):
file_path = shared.path_join(
PATHS["data_phase"], "gcs_totals_by_language.csv"
)
shared.data_to_csv(args, data, file_path)
shared.dataframe_to_csv(args, data, file_path)


def process_totals_by_country(args, data):
@@ -311,7 +311,7 @@ def process_totals_by_country(args, data):
file_path = shared.path_join(
PATHS["data_phase"], "gcs_totals_by_country.csv"
)
shared.data_to_csv(args, data, file_path)
shared.dataframe_to_csv(args, data, file_path)


def main():
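Note: shared.data_to_csv appears to have been renamed to shared.dataframe_to_csv throughout the process scripts; its body is in scripts/shared.py and is not shown in this diff. A minimal sketch, assuming it simply wraps pandas DataFrame.to_csv behind the same --enable-save guard used by the fetch scripts:

# Hypothetical sketch only; the actual scripts/shared.py helper may differ.
import os


def dataframe_to_csv(args, dataframe, file_path):
    """Write a pandas DataFrame to CSV when saving is enabled."""
    if not args.enable_save:
        return
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)  # assumed behavior
    dataframe.to_csv(file_path, index=False)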
4 changes: 2 additions & 2 deletions scripts/2-process/github_process.py
@@ -96,7 +96,7 @@ def process_totals_by_license(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "github_totals_by_license.csv"
)
shared.data_to_csv(args, data, file_path)
shared.dataframe_to_csv(args, data, file_path)


def process_totals_by_restriction(args, count_data):
@@ -130,7 +130,7 @@ def process_totals_by_restriction(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "github_totals_by_restriction.csv"
)
shared.data_to_csv(args, data, file_path)
shared.dataframe_to_csv(args, data, file_path)


def main():
6 changes: 3 additions & 3 deletions scripts/2-process/wikipedia_process.py
@@ -103,7 +103,7 @@ def process_highest_language_usage(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
)
shared.data_to_csv(args, top_10, file_path)
shared.dataframe_to_csv(args, top_10, file_path)


def process_least_language_usage(args, count_data):
@@ -126,7 +126,7 @@ def process_least_language_usage(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "wikipedia_least_language_usage.csv"
)
shared.data_to_csv(args, bottom_10, file_path)
shared.dataframe_to_csv(args, bottom_10, file_path)


def process_language_representation(args, count_data):
@@ -152,7 +152,7 @@ def process_language_representation(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "wikipedia_language_representation.csv"
)
shared.data_to_csv(args, language_counts, file_path)
shared.dataframe_to_csv(args, language_counts, file_path)


def main():