From a474c07b9a0293b9a4dd90099d8e24bbf62a35bd Mon Sep 17 00:00:00 2001
From: D-Pankey <30415217+D-Pankey@users.noreply.github.com>
Date: Tue, 3 Feb 2026 17:49:11 -0500
Subject: [PATCH 01/16] updated parse command to take multiple files

---
 scripts/parse_cohort_files.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py
index 46f83c0..5640b15 100644
--- a/scripts/parse_cohort_files.py
+++ b/scripts/parse_cohort_files.py
@@ -83,17 +83,23 @@ def ci_tags_to_primary_ids(samples, file_group):
     return primary_ids
 
 
-def parse_cohort_file(input_file, output_file, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"):
-    # Parse cohort file
-    samples = get_list_of_samples_from_cohort_file(input_file)
-    # Convert from ciTags to primaryIds
-    primary_ids = ci_tags_to_primary_ids(samples, file_group)
+import os
+
+def parse_cohort_files(input_directory, output_file):
+    all_samples = []
+
+    for file in os.listdir(input_directory):
+        if file.endswith(".txt"):
+            file_path = os.path.join(input_directory, file)
+            samples = get_list_of_samples_from_cohort_file(file_path)
+            all_samples.extend(samples)
+
+    # Write all samples to the output file
     with open(output_file, "w") as f:
-        for sample in primary_ids:
+        for sample in all_samples:
             f.write(f"{sample}\n")
-    print(f"File {output_file} successfully generated. Number of samples to run {len(primary_ids)}")
-
+    print(f"File {output_file} successfully generated. Number of samples to run {len(all_samples)}")
 
 HELP = """USAGE:
 python3 parse_cohort_files.py parse []
 python3 parse_cohort_files.py remove []

From 19c96f35a26e25db206faf3147055a78fa271b0a Mon Sep 17 00:00:00 2001
From: D-Pankey <30415217+D-Pankey@users.noreply.github.com>
Date: Tue, 3 Feb 2026 18:20:38 -0500
Subject: [PATCH 02/16] modify function name

---
 scripts/parse_cohort_files.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py
index 5640b15..9093f43 100644
--- a/scripts/parse_cohort_files.py
+++ b/scripts/parse_cohort_files.py
@@ -112,9 +112,9 @@ def parse_cohort_files(input_directory, output_file):
         exit(1)
     command = sys.argv[1]
     if command == "parse":
-        input_file = sys.argv[2]
+        input_directory = sys.argv[2]
         output_file = sys.argv[3]
-        parse_cohort_file(input_file, output_file)
+        parse_cohort_files(input_directory, output_file)
     elif command == "remove":
         input_file = sys.argv[2]
         if len(sys.argv) > 2:

From a48c6b5445e493d9524a78d5b88bc8bf4f1156d8 Mon Sep 17 00:00:00 2001
From: D-Pankey <30415217+D-Pankey@users.noreply.github.com>
Date: Tue, 3 Feb 2026 20:13:45 -0500
Subject: [PATCH 03/16] accept multiple files

---
 scripts/parse_cohort_files.py | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py
index 9093f43..b1fcbf7 100644
--- a/scripts/parse_cohort_files.py
+++ b/scripts/parse_cohort_files.py
@@ -83,23 +83,20 @@ def ci_tags_to_primary_ids(samples, file_group):
     return primary_ids
 
 
-import os
-
-def parse_cohort_files(input_directory, output_file):
-    all_samples = []
-
-    for file in os.listdir(input_directory):
-        if file.endswith(".txt"):
-            file_path = os.path.join(input_directory, file)
-            samples = get_list_of_samples_from_cohort_file(file_path)
-            all_samples.extend(samples)
-
-    # Write all samples to the output file
+def parse_cohort_file(input_files, output_file, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"):
+ all_primary_ids = [] + for input_file in input_files: + # Parse cohort file + samples = get_list_of_samples_from_cohort_file(input_file) + # Convert from ciTags to primaryIds + primary_ids = ci_tags_to_primary_ids(samples, file_group) + all_primary_ids.extend(primary_ids) with open(output_file, "w") as f: - for sample in all_samples: + for sample in all_primary_ids: f.write(f"{sample}\n") + print(f"File {output_file} successfully generated. Number of samples to run {len(all_primary_ids)}") + - print(f"File {output_file} successfully generated. Number of samples to run {len(all_samples)}") HELP = """USAGE: python3 parse_cohort_files.py parse [] python3 parse_cohort_files.py remove [] @@ -112,9 +109,9 @@ def parse_cohort_files(input_directory, output_file): exit(1) command = sys.argv[1] if command == "parse": - input_directory = sys.argv[2] - output_file = sys.argv[3] - parse_cohort_files(input_directory, output_file) + input_files = sys.argv[2:-1] # all input files + output_file = sys.argv[-1] # last argument + parse_cohort_file(input_files, output_file) elif command == "remove": input_file = sys.argv[2] if len(sys.argv) > 2: @@ -130,4 +127,4 @@ def parse_cohort_files(input_directory, output_file): else: create_check_script(input_file) else: - print(HELP) + print(HELP) \ No newline at end of file From e2d4b0e98dd22e321bcb0b11eff4d1b5c9d66cd8 Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Thu, 5 Feb 2026 13:10:13 -0500 Subject: [PATCH 04/16] Change primary IDs to sample IDs in cohort parsing --- scripts/parse_cohort_files.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index b1fcbf7..5b0d90c 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -84,17 +84,17 @@ def ci_tags_to_primary_ids(samples, file_group): def parse_cohort_file(input_files, output_file, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"): - all_primary_ids = [] + all_sample_ids = [] for input_file in input_files: # Parse cohort file samples = get_list_of_samples_from_cohort_file(input_file) # Convert from ciTags to primaryIds - primary_ids = ci_tags_to_primary_ids(samples, file_group) - all_primary_ids.extend(primary_ids) + #primary_ids = ci_tags_to_primary_ids(samples, file_group) + all_sample_ids.extend(samples) with open(output_file, "w") as f: - for sample in all_primary_ids: + for sample in all_sample_ids: f.write(f"{sample}\n") - print(f"File {output_file} successfully generated. Number of samples to run {len(all_primary_ids)}") + print(f"File {output_file} successfully generated. 
Number of samples to run {len(all_sample_ids)}") HELP = """USAGE: @@ -127,4 +127,4 @@ def parse_cohort_file(input_files, output_file, file_group="b54d035d-f63c-4ea8-8 else: create_check_script(input_file) else: - print(HELP) \ No newline at end of file + print(HELP) From f267b7603967da876a52fa93d186c22cf3221735 Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:58:10 -0500 Subject: [PATCH 05/16] Add list_directories function to generate directory list --- scripts/parse_cohort_files.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index 5b0d90c..3cae564 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -1,5 +1,6 @@ import re import sys +import os import bin.access_beagle_endpoint as beagle_api BEAGLE = beagle_api.AccessBeagleEndpoint() @@ -97,10 +98,22 @@ def parse_cohort_file(input_files, output_file, file_group="b54d035d-f63c-4ea8-8 print(f"File {output_file} successfully generated. Number of samples to run {len(all_sample_ids)}") +def list_directories(directories, output_file): + all_directories = [f for f in os.listdir(directories) if os.path.isdir(os.path.join(directories, f))] + + with open(output_file, "w") as f: + for directory in all_directories: + f.write(f"{directory}\n") + print(f"File {output_file} successfully generated. Number of directories in BAM folder {len(all_directories)}") + + + HELP = """USAGE: python3 parse_cohort_files.py parse [] python3 parse_cohort_files.py remove [] python3 parse_cohort_files.py check [] +python3 parse_cohort_files.py list_dir [] + """ if __name__ == "__main__": @@ -126,5 +139,12 @@ def parse_cohort_file(input_files, output_file, file_group="b54d035d-f63c-4ea8-8 create_check_script(input_file, output_file) else: create_check_script(input_file) + elif command == "list_dir": + directories = sys.argv[2] + if len(sys.argv) > 2: + output_file = sys.argv[3] + list_directories(directories, output_file) + else: + list_directories(directories) else: print(HELP) From 96b9c549752a38503ea07968edc6a192a6bac121 Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:32:31 -0500 Subject: [PATCH 06/16] Added compare function to compare outputs --- scripts/parse_cohort_files.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index 3cae564..6e4b30c 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -106,6 +106,27 @@ def list_directories(directories, output_file): f.write(f"{directory}\n") print(f"File {output_file} successfully generated. 
Number of directories in BAM folder {len(all_directories)}") +def compare_files(output_file_1, output_file_2, report_file): + # Read parsed output + with open(output_file_1) as f: + parsed_output = {line.strip() for line in f if line.strip()} + + # Read directory listing output + with open(output_file_2) as f: + directory_output = {line.strip() for line in f if line.strip()} + + # Compare + unique_to_parsed_output = parsed_output - directory_output + unique_to_directory_output = directory_output - parsed_output + + # Write results to file + with open(report_file, "w") as f: + f.write("Elements only in parsed output:\n") + f.write("\n".join(sorted(unique_to_parsed_output)) + "\n\n") + + f.write("Elements only in directory output:\n") + f.write("\n".join(sorted(unique_to_directory_output)) + "\n") + HELP = """USAGE: @@ -113,6 +134,7 @@ def list_directories(directories, output_file): python3 parse_cohort_files.py remove [] python3 parse_cohort_files.py check [] python3 parse_cohort_files.py list_dir [] +python3 parse_cohort_files.py compare_files [] """ @@ -141,10 +163,16 @@ def list_directories(directories, output_file): create_check_script(input_file) elif command == "list_dir": directories = sys.argv[2] - if len(sys.argv) > 2: + if len(sys.argv) > 3: output_file = sys.argv[3] list_directories(directories, output_file) else: list_directories(directories) + elif command == "compare": + output_file_1 = sys.argv[2] + output_file_2 = sys.argv[3] + if len(sys.argv) > 4: + report_file = sys.argv[4] + compare_files(output_file_1, output_file_2, report_file) else: print(HELP) From 5c5ab2437764551c5c26a629d94055c7be703099 Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Fri, 6 Feb 2026 21:13:02 -0500 Subject: [PATCH 07/16] Removed restriction of which file to compare first --- scripts/parse_cohort_files.py | 67 +++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index 6e4b30c..0aca1c1 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -106,26 +106,38 @@ def list_directories(directories, output_file): f.write(f"{directory}\n") print(f"File {output_file} successfully generated. 
Number of directories in BAM folder {len(all_directories)}") -def compare_files(output_file_1, output_file_2, report_file): - # Read parsed output - with open(output_file_1) as f: - parsed_output = {line.strip() for line in f if line.strip()} - - # Read directory listing output - with open(output_file_2) as f: - directory_output = {line.strip() for line in f if line.strip()} - - # Compare - unique_to_parsed_output = parsed_output - directory_output - unique_to_directory_output = directory_output - parsed_output - - # Write results to file - with open(report_file, "w") as f: - f.write("Elements only in parsed output:\n") - f.write("\n".join(sorted(unique_to_parsed_output)) + "\n\n") - - f.write("Elements only in directory output:\n") - f.write("\n".join(sorted(unique_to_directory_output)) + "\n") +def compare_files(file_1, file_2, report_file): + try: + # Read parsed output in read mode + with open(file_1, 'r') as f: + output1 = {line.strip() for line in f if line.strip()} + + # Read directory listing output in read mode + with open(file_2, 'r') as f: + output2 = {line.strip() for line in f if line.strip()} + + # Compare + unique_to_file1 = output1 - output2 + unique_to_file2 = output2 - output1 + + # Debugging prints to check the differences + print(f"Unique to {file_1}: {unique_to_file1}") + print(f"Unique to {file_2}: {unique_to_file2}") + + # Write results to file in write mode + with open(report_file, "w") as f: # 'w' means write mode + f.write(f"Elements only in {file_1}:\n") + f.write("\n".join(sorted(unique_to_file1)) + "\n\n") + + f.write(f"Elements only in {file_2}:\n") + f.write("\n".join(sorted(unique_to_file2)) + "\n") + + print(f"Comparison complete. Results written to {report_file}") + + except FileNotFoundError as e: + print(f"Error: {e}") + except Exception as e: + print(f"An unexpected error occurred: {e}") @@ -134,7 +146,7 @@ def compare_files(output_file_1, output_file_2, report_file): python3 parse_cohort_files.py remove [] python3 parse_cohort_files.py check [] python3 parse_cohort_files.py list_dir [] -python3 parse_cohort_files.py compare_files [] +python3 parse_cohort_files.py compare """ @@ -169,10 +181,13 @@ def compare_files(output_file_1, output_file_2, report_file): else: list_directories(directories) elif command == "compare": - output_file_1 = sys.argv[2] - output_file_2 = sys.argv[3] - if len(sys.argv) > 4: - report_file = sys.argv[4] - compare_files(output_file_1, output_file_2, report_file) + if len(sys.argv) < 5: # At least two files and one report file + print("Usage: python script.py compare file1 file2 report_file") + sys.exit(1) + file_1 = sys.argv[2] + file_2 = sys.argv[3] + report_file = sys.argv[4] + compare_files(file_1, file_2, report_file) else: print(HELP) + \ No newline at end of file From 507c81ffa46ed8526fbd3e2dcd0b753e3ee229eb Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Fri, 6 Feb 2026 22:59:20 -0500 Subject: [PATCH 08/16] update compare command --- scripts/parse_cohort_files.py | 52 ++++++++++++++--------------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index 0aca1c1..f9ec720 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -106,38 +106,26 @@ def list_directories(directories, output_file): f.write(f"{directory}\n") print(f"File {output_file} successfully generated. 
Number of directories in BAM folder {len(all_directories)}") -def compare_files(file_1, file_2, report_file): - try: - # Read parsed output in read mode - with open(file_1, 'r') as f: - output1 = {line.strip() for line in f if line.strip()} - - # Read directory listing output in read mode - with open(file_2, 'r') as f: - output2 = {line.strip() for line in f if line.strip()} - - # Compare - unique_to_file1 = output1 - output2 - unique_to_file2 = output2 - output1 - - # Debugging prints to check the differences - print(f"Unique to {file_1}: {unique_to_file1}") - print(f"Unique to {file_2}: {unique_to_file2}") - - # Write results to file in write mode - with open(report_file, "w") as f: # 'w' means write mode - f.write(f"Elements only in {file_1}:\n") - f.write("\n".join(sorted(unique_to_file1)) + "\n\n") - - f.write(f"Elements only in {file_2}:\n") - f.write("\n".join(sorted(unique_to_file2)) + "\n") - - print(f"Comparison complete. Results written to {report_file}") +def compare_files(file1, file2, report_file): + # Read file1 output + with open(file1) as f: + output1 = {line.strip() for line in f if line.strip()} + + # Read file2 output + with open(file2) as f: + output2 = {line.strip() for line in f if line.strip()} + # Compare + unique_to_file1 = output1 - output2 + unique_to_file2 = output2 - output1 - except FileNotFoundError as e: - print(f"Error: {e}") - except Exception as e: - print(f"An unexpected error occurred: {e}") + # Write results to files + with open(report_file, "w") as f: + f.write(f"Elements only in {file1}:\n") + f.write("\n".join(sorted(unique_to_file1)) + "\n\n") + + f.write(f"Elements only in {file2}:\n") + f.write("\n".join(sorted(unique_to_file2)) + "\n") + @@ -190,4 +178,4 @@ def compare_files(file_1, file_2, report_file): compare_files(file_1, file_2, report_file) else: print(HELP) - \ No newline at end of file + \ No newline at end of file From 0ae0f98480e7d1c0b3637b55a765c40bf3212862 Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Fri, 6 Feb 2026 23:53:13 -0500 Subject: [PATCH 09/16] updated HELP --- scripts/parse_cohort_files.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index f9ec720..57d6bf9 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -131,6 +131,7 @@ def compare_files(file1, file2, report_file): HELP = """USAGE: python3 parse_cohort_files.py parse [] + - can be a single file, multiple files, or a wildcard (e.g., /path/to/files/*.txt) python3 parse_cohort_files.py remove [] python3 parse_cohort_files.py check [] python3 parse_cohort_files.py list_dir [] From 647ba3e798fc66d9a8210966fe9002370585c656 Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Mon, 9 Feb 2026 16:30:29 -0500 Subject: [PATCH 10/16] merged functions --- scripts/parse_cohort_files.py | 85 +++++++++++++---------------------- 1 file changed, 31 insertions(+), 54 deletions(-) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index 57d6bf9..00ec01e 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -84,53 +84,44 @@ def ci_tags_to_primary_ids(samples, file_group): return primary_ids -def parse_cohort_file(input_files, output_file, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"): +def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"): all_sample_ids = [] + + # Process cohort files for 
input_file in input_files: # Parse cohort file samples = get_list_of_samples_from_cohort_file(input_file) - # Convert from ciTags to primaryIds - #primary_ids = ci_tags_to_primary_ids(samples, file_group) + # Convert from ciTags to primaryIds if needed + # primary_ids = ci_tags_to_primary_ids(samples, file_group) all_sample_ids.extend(samples) + with open(output_file, "w") as f: for sample in all_sample_ids: f.write(f"{sample}\n") - print(f"File {output_file} successfully generated. Number of samples to run {len(all_sample_ids)}") - - -def list_directories(directories, output_file): - all_directories = [f for f in os.listdir(directories) if os.path.isdir(os.path.join(directories, f))] - - with open(output_file, "w") as f: - for directory in all_directories: - f.write(f"{directory}\n") - print(f"File {output_file} successfully generated. Number of directories in BAM folder {len(all_directories)}") - -def compare_files(file1, file2, report_file): - # Read file1 output - with open(file1) as f: - output1 = {line.strip() for line in f if line.strip()} - - # Read file2 output - with open(file2) as f: - output2 = {line.strip() for line in f if line.strip()} - # Compare - unique_to_file1 = output1 - output2 - unique_to_file2 = output2 - output1 - # Write results to files - with open(report_file, "w") as f: - f.write(f"Elements only in {file1}:\n") - f.write("\n".join(sorted(unique_to_file1)) + "\n\n") - - f.write(f"Elements only in {file2}:\n") - f.write("\n".join(sorted(unique_to_file2)) + "\n") - + print(f"File {output_file} successfully generated. Number of samples to run: {len(all_sample_ids)}") + + # List directories + all_directories = [f for f in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, f))] + + # Compare outputs + samples_set = set(all_sample_ids) + directories_set = set(all_directories) + + unique_to_samples = samples_set - directories_set + unique_to_directories = directories_set - samples_set + + print(f"Unique to samples: {unique_to_samples}") + print(f"Unique to directories: {unique_to_directories}") + return { + "unique_to_samples": unique_to_samples, + "unique_to_directories": unique_to_directories + } HELP = """USAGE: -python3 parse_cohort_files.py parse [] +python3 parse_cohort_files.py parse [] - can be a single file, multiple files, or a wildcard (e.g., /path/to/files/*.txt) python3 parse_cohort_files.py remove [] python3 parse_cohort_files.py check [] @@ -140,14 +131,15 @@ def compare_files(file1, file2, report_file): """ if __name__ == "__main__": - if len(sys.argv) < 2: + if len(sys.argv) < 5: print(HELP) exit(1) command = sys.argv[1] if command == "parse": - input_files = sys.argv[2:-1] # all input files - output_file = sys.argv[-1] # last argument - parse_cohort_file(input_files, output_file) + input_files = sys.argv[2] + directory_path= sys.argv[3] + output_file = sys.argv[4] + parse_cohort_file(input_files, directory_path, output_file) elif command == "remove": input_file = sys.argv[2] if len(sys.argv) > 2: @@ -161,22 +153,7 @@ def compare_files(file1, file2, report_file): output_file = sys.argv[3] create_check_script(input_file, output_file) else: - create_check_script(input_file) - elif command == "list_dir": - directories = sys.argv[2] - if len(sys.argv) > 3: - output_file = sys.argv[3] - list_directories(directories, output_file) - else: - list_directories(directories) - elif command == "compare": - if len(sys.argv) < 5: # At least two files and one report file - print("Usage: python script.py compare file1 file2 report_file") - 
sys.exit(1) - file_1 = sys.argv[2] - file_2 = sys.argv[3] - report_file = sys.argv[4] - compare_files(file_1, file_2, report_file) + create_check_script(input_file) else: print(HELP) \ No newline at end of file From 7b0236271079672756de2eca39574d6f911b5abc Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Mon, 9 Feb 2026 17:45:24 -0500 Subject: [PATCH 11/16] make subdirectory list optional --- scripts/parse_cohort_files.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index 00ec01e..0fa2374 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -84,7 +84,7 @@ def ci_tags_to_primary_ids(samples, file_group): return primary_ids -def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"): +def parse_cohort_file(input_files, output_file, directory_path=None, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"): all_sample_ids = [] # Process cohort files @@ -101,8 +101,10 @@ def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d print(f"File {output_file} successfully generated. Number of samples to run: {len(all_sample_ids)}") - # List directories - all_directories = [f for f in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, f))] + # List directories (optional) + if directory_path: + + all_directories = [f for f in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, f))] # Compare outputs samples_set = set(all_sample_ids) @@ -121,7 +123,7 @@ def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d HELP = """USAGE: -python3 parse_cohort_files.py parse [] +python3 parse_cohort_files.py parse [] [] - can be a single file, multiple files, or a wildcard (e.g., /path/to/files/*.txt) python3 parse_cohort_files.py remove [] python3 parse_cohort_files.py check [] @@ -131,14 +133,17 @@ def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d """ if __name__ == "__main__": - if len(sys.argv) < 5: + if len(sys.argv) < 4: print(HELP) exit(1) command = sys.argv[1] if command == "parse": - input_files = sys.argv[2] - directory_path= sys.argv[3] - output_file = sys.argv[4] + input_files = sys.argv[2:-2] + output_file = sys.argv[-1] + # directory is optional + if len(sys.argv) > 4 and os.path.isdir(sys.argv[-2]): + directory_path = sys.argv[-2] + input_files = sys.argv[2:-2] parse_cohort_file(input_files, directory_path, output_file) elif command == "remove": input_file = sys.argv[2] From f4b0eb552fb873b1c51b651e29b32782170f441e Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Mon, 9 Feb 2026 17:59:17 -0500 Subject: [PATCH 12/16] reverting changes --- scripts/parse_cohort_files.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index 0fa2374..2c47b55 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -84,7 +84,7 @@ def ci_tags_to_primary_ids(samples, file_group): return primary_ids -def parse_cohort_file(input_files, output_file, directory_path=None, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"): +def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"): all_sample_ids = [] # Process cohort files @@ -101,10 +101,8 @@ def 
parse_cohort_file(input_files, output_file, directory_path=None, file_group= print(f"File {output_file} successfully generated. Number of samples to run: {len(all_sample_ids)}") - # List directories (optional) - if directory_path: - - all_directories = [f for f in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, f))] + # List directories + all_directories = [f for f in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, f))] # Compare outputs samples_set = set(all_sample_ids) @@ -123,7 +121,7 @@ def parse_cohort_file(input_files, output_file, directory_path=None, file_group= HELP = """USAGE: -python3 parse_cohort_files.py parse [] [] +python3 parse_cohort_files.py parse [] - can be a single file, multiple files, or a wildcard (e.g., /path/to/files/*.txt) python3 parse_cohort_files.py remove [] python3 parse_cohort_files.py check [] @@ -133,17 +131,14 @@ def parse_cohort_file(input_files, output_file, directory_path=None, file_group= """ if __name__ == "__main__": - if len(sys.argv) < 4: + if len(sys.argv) < 5: print(HELP) exit(1) command = sys.argv[1] if command == "parse": - input_files = sys.argv[2:-2] - output_file = sys.argv[-1] - # directory is optional - if len(sys.argv) > 4 and os.path.isdir(sys.argv[-2]): - directory_path = sys.argv[-2] - input_files = sys.argv[2:-2] + input_files = sys.argv[2:-2] + directory_path = sys.argv[-2] + output_file = sys.argv[-1] parse_cohort_file(input_files, directory_path, output_file) elif command == "remove": input_file = sys.argv[2] From 157486dc15a5e59b8cb1630e02a75fc6f8318a18 Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Tue, 10 Feb 2026 16:18:18 -0500 Subject: [PATCH 13/16] cmoSampleName conversion --- scripts/parse_cohort_files.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index 2c47b55..5d0a205 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -63,23 +63,23 @@ def get_list_of_samples_from_cohort_file(file_path): return list(samples) -def ci_tags_to_primary_ids(samples, file_group): +def cmo_sample_name_to_primary_ids(samples, file_group): """ Args: - samples: list of ciTags + samples: list of cmoSampleName Returns: """ total_number_of_samples = len(samples) primary_ids = [] - for idx, ci_tag in enumerate(samples, start=1): - files = BEAGLE.get_files_by_metadata(f"ciTag:{ci_tag}", file_group) + for idx, cmo_sample_name in enumerate(samples, start=1): + files = BEAGLE.get_files_by_metadata(f"cmoSampleName:{cmo_sample_name}", file_group) if not files: - print(f"Unable to locate ciTag:{ci_tag}") + print(f"Unable to locate cmoSampleName:{cmo_sample_name}") continue primary_id = files[0]["metadata"]["primaryId"] - print(f"Fetching {ci_tag}:{primary_id}. Remaining {total_number_of_samples - idx}...") + print(f"Fetching {cmo_sample_name}:{primary_id}. 
Remaining {total_number_of_samples - idx}...") primary_ids.append(primary_id) return primary_ids @@ -109,14 +109,16 @@ def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d directories_set = set(all_directories) unique_to_samples = samples_set - directories_set - unique_to_directories = directories_set - samples_set - print(f"Unique to samples: {unique_to_samples}") - print(f"Unique to directories: {unique_to_directories}") + + missing_sample_ids = cmo_sample_name_to_primary_ids( + list(unique_to_samples), + file_group +) return { "unique_to_samples": unique_to_samples, - "unique_to_directories": unique_to_directories + "missing_sample_ids": missing_sample_ids, } From 6818c8afd3be609add5e865125756befe31b9538 Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Tue, 10 Feb 2026 16:50:18 -0500 Subject: [PATCH 14/16] remove printing unique_to_samples in console --- scripts/parse_cohort_files.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index 5d0a205..59fffb2 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -89,16 +89,14 @@ def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d # Process cohort files for input_file in input_files: - # Parse cohort file samples = get_list_of_samples_from_cohort_file(input_file) - # Convert from ciTags to primaryIds if needed - # primary_ids = ci_tags_to_primary_ids(samples, file_group) all_sample_ids.extend(samples) - + + # Write all parsed sample names to output file with open(output_file, "w") as f: for sample in all_sample_ids: f.write(f"{sample}\n") - + print(f"File {output_file} successfully generated. 
Number of samples to run: {len(all_sample_ids)}")
 
     # List directories
@@ -109,16 +107,27 @@ def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d
     directories_set = set(all_directories)
 
     unique_to_samples = samples_set - directories_set
-    print(f"Unique to samples: {unique_to_samples}")
+    #print(f"Unique to samples: {unique_to_samples}")
+
+
+    # Convert missing sample names to primaryIds
+    primary_ids = cmo_sample_name_to_primary_ids(
+        list(unique_to_samples),
+        file_group
+    )
+
+    # Append missing primaryIds to the same output file
+    with open(output_file, "a") as f:
+        f.write("\n# Missing primaryIds\n")
+        for primary_id in primary_ids:
+            f.write(f"{primary_id}\n")
 
-    missing_sample_ids = cmo_sample_name_to_primary_ids(
-        list(unique_to_samples),
-        file_group
-)
+    print(f"Appended {len(primary_ids)} missing primaryIds to {output_file}")
 
+    # Return both sets for downstream use if needed
     return {
         "unique_to_samples": unique_to_samples,
-        "missing_sample_ids": missing_sample_ids,
+        "primary_ids": primary_ids,
     }

From 1a0c4d33e5689bf535bf853d0bcc253506ee420d Mon Sep 17 00:00:00 2001
From: D-Pankey <30415217+D-Pankey@users.noreply.github.com>
Date: Thu, 12 Feb 2026 15:29:45 -0500
Subject: [PATCH 15/16] write differences to separate output file

---
 scripts/parse_cohort_files.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py
index 59fffb2..ebdacdd 100644
--- a/scripts/parse_cohort_files.py
+++ b/scripts/parse_cohort_files.py
@@ -84,7 +84,7 @@ def cmo_sample_name_to_primary_ids(samples, file_group):
     return primary_ids
 
 
-def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"):
+def parse_cohort_file(input_files, directory_path, output_file, diff_output_file, file_group="b54d035d-f63c-4ea8-86fb-9dbc976bb7fe"):
     all_sample_ids = []
 
     # Process cohort files
@@ -116,13 +116,12 @@ def parse_cohort_file(input_files, directory_path, output_file, diff_output_file
         file_group
     )
 
-    # Append missing primaryIds to the same output file
-    with open(output_file, "a") as f:
-        f.write("\n# Missing primaryIds\n")
+    # Write missing primaryIds to the different output file
+    with open(diff_output_file, "w") as f:
         for primary_id in primary_ids:
            f.write(f"{primary_id}\n")
 
-    print(f"Appended {len(primary_ids)} missing primaryIds to {output_file}")
+    print(f"File {diff_output_file} successfully generated.
Number of samples missing: {len(unique_to_samples)}") # Return both sets for downstream use if needed return { @@ -132,8 +131,9 @@ def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d HELP = """USAGE: -python3 parse_cohort_files.py parse [] +python3 parse_cohort_files.py parse [] - can be a single file, multiple files, or a wildcard (e.g., /path/to/files/*.txt) + - is the path containing existing directories to compare python3 parse_cohort_files.py remove [] python3 parse_cohort_files.py check [] python3 parse_cohort_files.py list_dir [] @@ -142,15 +142,16 @@ def parse_cohort_file(input_files, directory_path, output_file, file_group="b54d """ if __name__ == "__main__": - if len(sys.argv) < 5: + if len(sys.argv) < 6: print(HELP) exit(1) command = sys.argv[1] if command == "parse": - input_files = sys.argv[2:-2] - directory_path = sys.argv[-2] - output_file = sys.argv[-1] - parse_cohort_file(input_files, directory_path, output_file) + input_files = sys.argv[2:-3] + directory_path = sys.argv[-3] + output_file = sys.argv[-2] + diff_output_file = sys.argv[-1] + parse_cohort_file(input_files, directory_path, output_file, diff_output_file) elif command == "remove": input_file = sys.argv[2] if len(sys.argv) > 2: From 21b75c005529c3e0ca20532df6836157066156b8 Mon Sep 17 00:00:00 2001 From: D-Pankey <30415217+D-Pankey@users.noreply.github.com> Date: Thu, 12 Feb 2026 16:04:20 -0500 Subject: [PATCH 16/16] remove comment --- scripts/parse_cohort_files.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/parse_cohort_files.py b/scripts/parse_cohort_files.py index ebdacdd..832dcec 100644 --- a/scripts/parse_cohort_files.py +++ b/scripts/parse_cohort_files.py @@ -107,9 +107,8 @@ def parse_cohort_file(input_files, directory_path, output_file, diff_output_file directories_set = set(all_directories) unique_to_samples = samples_set - directories_set - #print(f"Unique to samples: {unique_to_samples}") - + # Convert missing sample names to primaryIds primary_ids = cmo_sample_name_to_primary_ids( list(unique_to_samples),
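A usage sketch of the parse command in its final form (after PATCH 16/16). The cohort file names and the BAM folder path below are hypothetical placeholders; everything between "parse" and the last three arguments is read as a cohort file (sys.argv[2:-3]), and the last three map to directory_path, output_file, and diff_output_file, in that order:

    python3 parse_cohort_files.py parse cohort_A.txt cohort_B.txt /path/to/bam_folder samples_to_run.txt missing_primary_ids.txt

A minimal standalone sketch of the comparison step that the merged parse_cohort_file performs, assuming cohort files with one cmoSampleName per line and a BAM folder whose immediate subdirectories are named by sample; the helper name is illustrative and the Beagle primaryId lookup is left out:

    import os

    def samples_missing_bam_dirs(cohort_files, bam_folder):
        # Collect sample names from every cohort file (one name per line assumed)
        samples = set()
        for path in cohort_files:
            with open(path) as f:
                samples.update(line.strip() for line in f if line.strip())
        # Immediate subdirectories of the BAM folder, named by sample
        bam_dirs = {d for d in os.listdir(bam_folder)
                    if os.path.isdir(os.path.join(bam_folder, d))}
        # Samples listed in the cohorts but without a BAM directory yet; the real
        # script converts these via cmo_sample_name_to_primary_ids() and writes
        # them to diff_output_file
        return samples - bam_dirs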