-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_data.py
More file actions
executable file
·34 lines (23 loc) · 886 Bytes
/
debug_data.py
File metadata and controls
executable file
·34 lines (23 loc) · 886 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""Creates a smaller data set from the real data set for testing and debugging"""
from sys import argv
from pandas import DataFrame, read_csv
from pickle import load
from random import seed, shuffle
from utils import COL_TYPES_PICKLE_PATH
def debug_data() -> None:
    """Write a reduced copy of the real data set for testing and debugging.

    Command-line arguments (read from ``argv``):
        1. Path to the full data set CSV.
        2. Number of columns to keep, in addition to ``PTID``.

    The candidate column names are the keys of the dict pickled at
    ``COL_TYPES_PICKLE_PATH``. They are sorted, shuffled with a fixed seed and
    truncated, so repeated runs against the same pickle pick the same columns.
    The result is written to ``data/debug-data.csv`` without the index.

    Raises:
        SystemExit: If the two required arguments are not supplied.
        ValueError: If the column count is not an integer or is negative.
    """
    # Fail with a usage hint instead of a bare IndexError on missing arguments
    if len(argv) < 3:
        raise SystemExit('usage: debug_data.py DATA_PATH N_COLS')

    data_path: str = argv[1]
    n_cols: int = int(argv[2])
    if n_cols < 0:
        # A negative count would silently slice from the end of the header list
        raise ValueError(f'N_COLS must be non-negative, got {n_cols}')

    # Ensure the same columns are selected each time this script is run
    seed(0)

    # NOTE: pickle must only be loaded from trusted files; this path is a
    # project constant imported from utils
    with open(COL_TYPES_PICKLE_PATH, 'rb') as f:
        col_types: dict = load(f)

    # Sort the headers first so the seeded shuffle starts from a fixed order
    headers: list = sorted(col_types)
    shuffle(headers)
    selected: list = headers[:n_cols]

    # Keep only PTID plus the selected columns and write them out
    data: DataFrame = read_csv(data_path, usecols=['PTID'] + selected)
    data.to_csv('data/debug-data.csv', index=False)
# Script entry point: build the debug data set only when this file is run
# directly, not when it is imported
if __name__ == "__main__":
    debug_data()