From 2cfc1ba4ea7f9cb2bd5c7e8707f8baa8f2c081e4 Mon Sep 17 00:00:00 2001 From: swinersha Date: Wed, 31 Jul 2024 13:44:46 +0000 Subject: [PATCH 1/6] feat: k_all pixels and interactive find pairs drafted --- .gitignore | 2 +- methods/matching/calculate_k.py | 29 +- .../cluster_find_pairs_interactive.py | 581 ++++++++++++++++++ 3 files changed, 599 insertions(+), 13 deletions(-) create mode 100644 methods/matching/cluster_find_pairs_interactive.py diff --git a/.gitignore b/.gitignore index eac8079..123285e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ .DS_Store *.npy data - +Figures # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/methods/matching/calculate_k.py b/methods/matching/calculate_k.py index dfecd05..dba2284 100644 --- a/methods/matching/calculate_k.py +++ b/methods/matching/calculate_k.py @@ -21,10 +21,13 @@ LARGE_PROJECT_PIXEL_DENSITY_PER_HECTARE = 0.05 # The '2 *' in this is because I'm just considering one axis, rather than area -PIXEL_SKIP_SMALL_PROJECT = \ - round((HECTARE_WIDTH_IN_METERS / (2 * SMALL_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS) -PIXEL_SKIP_LARGE_PROJECT = \ - round((HECTARE_WIDTH_IN_METERS / (2 * LARGE_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS) +# PIXEL_SKIP_SMALL_PROJECT = \ +# round((HECTARE_WIDTH_IN_METERS / (2 * SMALL_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS) +# PIXEL_SKIP_LARGE_PROJECT = \ +# round((HECTARE_WIDTH_IN_METERS / (2 * LARGE_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS) + +PIXEL_SKIP_SMALL_PROJECT = 1 +PIXEL_SKIP_LARGE_PROJECT = 1 MatchingCollection = namedtuple('MatchingCollection', ['boundary', 'lucs', 'cpcs', 'ecoregions', 'elevation', 'slope', 'access', 'countries']) @@ -66,10 +69,11 @@ def build_layer_collection( # ecoregions is such a heavy layer it pays to just rasterize it once - we should possibly do this once # as part of import of the ecoregions data - ecoregions = GroupLayer([ - RasterLayer.layer_from_file(os.path.join(ecoregions_directory_path, filename)) for filename in - glob.glob("*.tif", root_dir=ecoregions_directory_path) - ], name="ecoregions") + # ecoregions = GroupLayer([ + # RasterLayer.layer_from_file(os.path.join(ecoregions_directory_path, filename)) for filename in + # glob.glob("*.tif", root_dir=ecoregions_directory_path) + # ], name="ecoregions") + ecoregions = RasterLayer.layer_from_file(ecoregions_directory_path) elevation = GroupLayer([ RasterLayer.layer_from_file(os.path.join(elevation_directory_path, filename)) for filename in @@ -80,10 +84,11 @@ def build_layer_collection( glob.glob("slope*.tif", root_dir=slope_directory_path) ], name="slopes") - access = GroupLayer([ - RasterLayer.layer_from_file(os.path.join(access_directory_path, filename)) for filename in - glob.glob("*.tif", root_dir=access_directory_path) - ], name="access") + # access = GroupLayer([ + # RasterLayer.layer_from_file(os.path.join(access_directory_path, filename)) for filename in + # glob.glob("*.tif", root_dir=access_directory_path) + # ], name="access") + access = RasterLayer.layer_from_file(access_directory_path) countries = RasterLayer.layer_from_file(countries_raster_filename) diff --git a/methods/matching/cluster_find_pairs_interactive.py b/methods/matching/cluster_find_pairs_interactive.py new file mode 100644 index 0000000..9ac25fb --- /dev/null +++ b/methods/matching/cluster_find_pairs_interactive.py @@ -0,0 +1,581 @@ +# source myenv/bin/activate + +import pandas as pd +import numpy as np +from numba import njit +from sklearn.decomposition import PCA +from sklearn.cluster import KMeans +import matplotlib.pyplot as plt +import geopandas as gpd +import os +import time +import sys +import faiss + +# Read K and M +# Repeat 100 times - Ultimately we would like to remove this step +# PCA across K and M +# Divide into clusters - Check how much splitting into clusters speeds things up +# Sample K to find K_sub +# Sample M to find M_sub +# Split M into LUC combinations - How big is each set? +# For each cluster in K_sub +# Split cluster in K_sub into categorical combinations - How big is each set? +# For each categorical combination +# Sample from the equivalent cluster and categorical combindation in M_sub +# Find pairs for the categorical combinations in K_sub from the categorical combinations in M_sub +# RowBind categorical combination sets +# RowBind cluster sets +# Save Pairs + + +# NOTES +# 1. We might need to combine some categorical subsets because at least for comparing validity of the matches +# because if the subsets are too small the standardised mean differences can be very wrong +# 2. One option would be to combine clusters based on the proximity of the cluster centres. For the LUCs, we might +# combine groups that are in the same state when the project begins, even if the LUC history is not the same. +# 3. There is a question of how much supposed additionality is created by each categorical subset? If it is +# nothing, it might not matter. If it is substantive then it definitely does matter. + + + +# this function uses loops instead of numpy vector operations +@njit +def loop_match(m_pca, k_pca): + picked = np.ones((m_pca.shape[0],), dtype=np.bool_) + fast_matches = np.full(k_pca.shape[0], -1, dtype=np.int32) + for i in range(0, k_pca.shape[0]): + min_squared_diff_sum = np.inf + min_squared_diff_j = -1 + for j in range(0, m_pca.shape[0]): + if picked[j]: + squared_diff_sum = np.sum((m_pca[j, :] - k_pca[i, :])**2) + if squared_diff_sum < min_squared_diff_sum: + min_squared_diff_sum = squared_diff_sum + min_squared_diff_j = j + fast_matches[i] = min_squared_diff_j + picked[min_squared_diff_j] = False + return fast_matches + +### Now try with real numbers + +def to_int32(x): + # Normalize the data to the range 0 to 1 + min_val = np.min(x) + max_val = np.max(x) + normalized_data = (x - min_val) / (max_val - min_val) + + # Scale the normalized data to the range 0 to 255 for unsigned 8-bit integers + scaled_data = normalized_data * 255 + + # Convert to 32-bit integers (0 to 255) + int32_data = scaled_data.astype(np.int32) + + return int32_data + +def to_pca_int32(x): + # Perform PCA and convert to dataframe + pca = PCA(n_components=min(len(x), len(x.columns)), + whiten=False) # Centering and scaling done by default + pca_result = pca.fit_transform(x) + pca_df = pd.DataFrame(pca_result) + # Convert all columns to int8 + pca_32 = pca_df.apply(to_int32) + return pca_32 + + +def calculate_smd(group1, group2): + # Means + mean1, mean2 = np.mean(group1), np.mean(group2) + # Standard deviations + std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1) + # Sample sizes + n1, n2 = len(group1), len(group2) + # Pooled standard deviation + pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2)) + # Standardized mean difference + smd = (mean1 - mean2) / pooled_std + return smd, mean1, mean2, pooled_std + + +K_SUB_PROPORTION = 0.01 +M_SUB_PROPORTION = 0.1 +# Number of clusters +N_CLUSTERS = 9 +# Number of iterations for K means fitting +N_ITERATIONS = 100 +VERBOSE = True + +# Define the start year +t0 = 2012 # READ THIS IN +match_years = [t0-10, t0-5, t0] + +# Read in the data +boundary = gpd.read_file('/maps/aew85/projects/1201.geojson') + +k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet') +m_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/1201/matches.parquet') + +# concat m and k +km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))), + m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True) + +# Select columns (excluding 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt', and those starting with 'luc') +exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt'] +exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')] + +match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years] + +# Extract only the continuous columns +continuous_columns = km_pixels.columns.difference(exclude_columns) +km_pixels_selected = km_pixels[continuous_columns] + +# PCA transform and conversion to 32 bit ints +km_pca = to_pca_int32(km_pixels_selected) + +# Looks good +km_pca.head() + +#------------------------------------------ + +# Initialize the KMeans object +kmeans = faiss.Kmeans(d=km_pca.shape[1], k=N_CLUSTERS, niter=N_ITERATIONS, verbose=True) +# Perform clustering +kmeans.train(km_pca) + +# Get cluster assignments +km_pixels['cluster'] = kmeans.index.search(km_pca, 1)[1].flatten() + +# Frequency distribution in each cluster +cluster_counts = pd.Series(km_pixels['cluster']).value_counts() +if VERBOSE: + print("Cluster counts:\n", cluster_counts) + + +# Convert to spatial (simple features) +km_pixels_sf = gpd.GeoDataFrame( + km_pixels, + geometry=gpd.points_from_xy(km_pixels['lng'], km_pixels['lat']), + crs="EPSG:4326" +) + + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Plot cluster centres + +# Get cluster centers +centroids = kmeans.centroids + +if VERBOSE: + # Plot the cluster centers + plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=100, marker='x', label='Cluster centers') + # Add cluster IDs as labels on the plot + for i, center in enumerate(centroids): + plt.text(center[0], center[1], str(i), color='red', fontsize=12, weight='bold') + + plt.title('K-means Clustering with Faiss') + plt.xlabel('PCA Component 1') + plt.ylabel('PCA Component 2') + plt.legend() + plt.show() + plt.savefig('Figures/cluster_centres_faiss_1.png') + plt.close() # Close the plot to free up memory + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Plot clusters as separate facets +if VERBOSE: + fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 15)) + axes = axes.flatten() + + clusters = sorted(km_pixels_sf['cluster'].unique()) + + for i, cluster in enumerate(clusters): + ax = axes[i] + cluster_data = km_pixels_sf[km_pixels_sf['cluster'] == cluster] + cluster_data.plot(ax=ax, color='blue', markersize=0.2) + boundary.plot(ax=ax, edgecolor='black', facecolor='none') + ax.set_title(f'Cluster {cluster}') + ax.set_axis_off() + + # Turn off any unused subplots + for j in range(len(clusters), len(axes)): + fig.delaxes(axes[j]) + + plt.tight_layout() + plt.savefig('Figures/cluster_faiss_1_facet.png') + plt.close() + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Extract K and M pixels +k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt'] +m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl'] + +# Extract K and M PCA transforms +k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy() +m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy() + +k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION) +m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION) + +# Define indexs for the samples from K and M +k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False) +m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False) + +# Take random samples from K and M pixels +k_sub = k_pixels.iloc[k_random_indices] +m_sub = m_pixels.iloc[m_random_indices] + +# Take corresponding random samples from the PCA transformed K and M +k_sub_pca = k_pca[k_random_indices,:] +m_sub_pca = m_pca[m_random_indices,:] + +if VERBOSE: + # Handy code for displaying the number of counts in each unique category combination + # In K + k_combination_counts = k_sub.groupby(match_cats).size().reset_index(name='counts') + print("k_combination_counts") + print(k_combination_counts) + # In M + m_combination_counts = m_sub.groupby(match_cats).size().reset_index(name='counts') + print("m_combination_counts") + print(m_combination_counts) + + +# Identify the unique combinations of luc columns +k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats)) + +pairs_list = [] + +start_time = time.time() +for i in range(0, k_cat_combinations.shape[0]): + # i = 6 # ith element of the unique combinations of the luc time series in k + # for in range() + k_cat_comb = k_cat_combinations.iloc[i] + k_cat = k_sub[(k_sub[match_cats] == k_cat_comb).all(axis=1)] + k_cat_pca = k_sub_pca[(k_sub[match_cats] == k_cat_comb).all(axis=1)] + + # Find the subset in km_pixels that matches this combination + m_cat = m_sub[(m_sub[match_cats] == k_cat_comb).all(axis=1)] + m_cat_pca = m_sub_pca[(m_sub[match_cats] == k_cat_comb).all(axis=1)] + + if VERBOSE: + print('ksub_cat:' + str(k_cat.shape[0])) + print('msub_cat:' + str(m_cat.shape[0])) + + # If there is no suitable match for the pre-project luc time series + # Then it may be preferable to just take the luc state at t0 + # m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == K_luc_comb[1:3]).all(axis=1)] + # m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)] + # For if there are no matches return nothing + + if(m_cat.shape[0] < k_cat.shape[0] * 5): + print("M insufficient for matching. Set VERBOSE to True for more details.") + continue + + matches_index = loop_match(m_cat_pca, k_cat_pca) + m_cat_matches = m_cat.iloc[matches_index] + + # i = 0 + # matched = pd.concat([k_cat.iloc[i], m_cat.iloc[matches[i]]], axis=1, ignore_index=True) + # matched.columns = ['trt', 'ctrl'] + # matched + #Looks great! + columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope'] + # Calculate SMDs for the specified columns + smd_results = [] + for column in columns_to_compare: + smd, mean1, mean2, pooled_std = calculate_smd(k_cat[column], m_cat_matches[column]) + smd_results.append((column, smd, mean1, mean2, pooled_std)) + + # Convert the results to a DataFrame for better readability + smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std']) + + if VERBOSE: + # Print the results + print("categorical combination:") + print(k_cat_comb) + # Count how many items in 'column1' are not equal to the specified integer value + print("LUC flips in K:") + (k_cat['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum() + print("LUC flips in matches:") + (m_cat_matches['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum() + print("Standardized Mean Differences:") + print(smd_df) + + # Join the pairs into one dataframe: + k_cat = k_cat.reset_index(drop = True) + m_cat_matches = m_cat_matches.reset_index(drop = True) + pairs_df = pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1) + + # Append the resulting DataFrame to the list + pairs_list.append(pairs_df) + +# Combine all the DataFrames in the list into a single DataFrame +combined_pairs = pd.concat(pairs_list, ignore_index=True) + +end_time = time.time() +elapsed_time = end_time - start_time +if VERBOSE: + print(f"Elapsed time: {elapsed_time:.2f} seconds") + +columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope'] +# Calculate SMDs for the specified columns +smd_results = [] +for column in columns_to_compare: + smd, mean1, mean2, pooled_std = calculate_smd(combined_pairs['k_' + column], combined_pairs['s_' + column]) + smd_results.append((column, smd, mean1, mean2, pooled_std)) + +# Convert the results to a DataFrame for better readability +smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std']) +print(smd_df) + +smd_results = [] +for column in columns_to_compare: + smd, mean1, mean2, pooled_std = calculate_smd(k_pixels[column], m_pixels[column]) + smd_results.append((column, smd, mean1, mean2, pooled_std)) + +# Convert the results to a DataFrame for better readability +smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std']) +print(smd_df) + + + + + + + +def find_match_iteration( + k_parquet_filename: str, + m_parquet_filename: str, + start_year: int, + output_folder: str, + idx_and_seed: tuple[int, int] +) -> None: + logging.info("Find match iteration %d of %d", idx_and_seed[0] + 1, REPEAT_MATCH_FINDING) + rng = np.random.default_rng(idx_and_seed[1]) + + logging.info("Loading K from %s", k_parquet_filename) + k_pixels = pd.read_parquet(k_parquet_filename) + + logging.info("Loading M from %s", m_parquet_filename) + m_pixels = pd.read_parquet(m_parquet_filename) + + # concat m and k + km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))), + m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True) + + # Find the continuous columns + exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt'] + exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')] + continuous_columns = km_pixels.columns.difference(exclude_columns) + # Categorical columns + match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years] + + logging.info("Starting PCA transformation of k and m union. km_pixels.shape: %a", {km_pixels.shape}) + # PCA transform and conversion to 32 bit ints for continuous only + km_pca = to_pca_int32(km_pixels[continuous_columns]) + logging.info("Done PCA transformation") + + # Extract K and M pixels - this might be unnecessary I just wanted to make sure + # K and M were in the same order here and in the PCA transform + k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt'] + m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl'] + # Extract K and M PCA transforms + k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy() + m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy() + + # Sample from K and M + k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION) + m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION) + # Define indexs for the samples from K and M + k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False) + m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False) + # Take random samples from K and M pixels + k_sub = k_pixels.iloc[k_random_indices] + m_sub = m_pixels.iloc[m_random_indices] + # Take corresponding random samples from the PCA transformed K and M + k_sub_pca = k_pca[k_random_indices,:] + m_sub_pca = m_pca[m_random_indices,:] + + logging.info("Samples taken from K and M. k_sub.shape: %a; m_sub.shape: %a", {k_sub.shape, m_sub.shape}) + + # Identify the unique combinations of luc columns + k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats)) + + pairs_list = [] + matchless_list = [] + + logging.info("Starting greedy matching... k_sub.shape: %s, m_sub.shape: %s", + k_sub.shape, m_sub.shape) + + start_time = time.time() + for i in range(0, k_cat_combinations.shape[0]): + # i = 6 # ith element of the unique combinations of the luc time series in k + # for in range() + k_cat_comb = k_cat_combinations.iloc[i] + k_cat = k_sub[(k_sub[match_cats] == k_cat_comb).all(axis=1)] + k_cat_pca = k_sub_pca[(k_sub[match_cats] == k_cat_comb).all(axis=1)] + + # Find the subset in km_pixels that matches this combination + m_cat = m_sub[(m_sub[match_cats] == k_cat_comb).all(axis=1)] + m_cat_pca = m_sub_pca[(m_sub[match_cats] == k_cat_comb).all(axis=1)] + + if VERBOSE: + print('ksub_cat:' + str(k_cat.shape[0])) + print('msub_cat:' + str(m_cat.shape[0])) + + # If there is no suitable match for the pre-project luc time series + # Then it may be preferable to just take the luc state at t0 + # m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == K_luc_comb[1:3]).all(axis=1)] + # m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)] + # For if there are no matches return nothing + + if(m_cat.shape[0] < k_cat.shape[0] * 5): + # print("M insufficient for matching. Set VERBOSE to True for more details.") + # Append the matchless DataFrame to the list + matchless_list.append(k_cat) + continue + + # Find the matches + matches_index = loop_match(m_cat_pca, k_cat_pca) + m_cat_matches = m_cat.iloc[matches_index] + + # i = 0 + # matched = pd.concat([k_cat.iloc[i], m_cat.iloc[matches[i]]], axis=1, ignore_index=True) + # matched.columns = ['trt', 'ctrl'] + # matched + #Looks great! + columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope'] + # Calculate SMDs for the specified columns + smd_results = [] + for column in columns_to_compare: + smd, mean1, mean2, pooled_std = calculate_smd(k_cat[column], m_cat_matches[column]) + smd_results.append((column, smd, mean1, mean2, pooled_std)) + + # Convert the results to a DataFrame for better readability + smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std']) + + if VERBOSE: + # Print the results + print("categorical combination:") + print(k_cat_comb) + # Count how many items in 'column1' are not equal to the specified integer value + print("LUC flips in K:") + (k_cat['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum() + print("LUC flips in matches:") + (m_cat_matches['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum() + print("Standardized Mean Differences:") + print(smd_df) + + # Join the pairs into one dataframe: + k_cat = k_cat.reset_index(drop = True) + m_cat_matches = m_cat_matches.reset_index(drop = True) + pairs_df = pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1) + + # Append the resulting DataFrame to the list + pairs_list.append(pairs_df) + + # Combine all the DataFrames in the list into a single DataFrame + pairs = pd.concat(pairs_list, ignore_index=True) + matchless = pd.concat(matchless_list, ignore_index=True) + + logging.info("Finished greedy matching... pairs.shape: %s, matchless.shape: %s", + pairs.shape, matchless.shape) + + logging.info("Starting storing matches...") + pairs.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet')) + matchless.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet')) + + logging.info("Finished find match iteration") + + +def find_pairs( + k_parquet_filename: str, + m_parquet_filename: str, + start_year: int, + seed: int, + output_folder: str, + processes_count: int +) -> None: + logging.info("Starting find pairs") + os.makedirs(output_folder, exist_ok=True) + + rng = np.random.default_rng(seed) + iteration_seeds = zip(range(REPEAT_MATCH_FINDING), rng.integers(0, 1000000, REPEAT_MATCH_FINDING)) + + with Pool(processes=processes_count) as pool: + pool.map( + partial( + find_match_iteration, + k_parquet_filename, + m_parquet_filename, + start_year, + output_folder + ), + iteration_seeds + ) + + +def main(): + # If you use the default multiprocess model then you risk deadlocks when logging (which we + # have hit). Spawn is the default on macOS, but not on Linux. + set_start_method("spawn") + + parser = argparse.ArgumentParser(description="Takes K and M and finds 100 sets of matches.") + parser.add_argument( + "--k", + type=str, + required=True, + dest="k_filename", + help="Parquet file containing pixels from K as generated by calculate_k.py" + ) + parser.add_argument( + "--m", + type=str, + required=True, + dest="m_filename", + help="Parquet file containing pixels from M as generated by build_m_table.py" + ) + parser.add_argument( + "--start_year", + type=int, + required=True, + dest="start_year", + help="Year project started." + ) + parser.add_argument( + "--seed", + type=int, + required=True, + dest="seed", + help="Random number seed, to ensure experiments are repeatable." + ) + parser.add_argument( + "--output", + type=str, + required=True, + dest="output_directory_path", + help="Directory into which output matches will be written. Will be created if it does not exist." + ) + parser.add_argument( + "-j", + type=int, + required=False, + default=round(cpu_count() / 2), + dest="processes_count", + help="Number of concurrent threads to use." + ) + args = parser.parse_args() + + find_pairs( + args.k_filename, + args.m_filename, + args.start_year, + args.seed, + args.output_directory_path, + args.processes_count + ) + +if __name__ == "__main__": + main() \ No newline at end of file From 088048eb0907685ec24ba6afc20cffe7ee71e2c7 Mon Sep 17 00:00:00 2001 From: swinersha Date: Wed, 31 Jul 2024 18:54:55 +0000 Subject: [PATCH 2/6] feat: fast find pairs experiment incorporated into executable --- methods/matching/find_pairs.py | 469 +++++++++++++++------------------ 1 file changed, 209 insertions(+), 260 deletions(-) diff --git a/methods/matching/find_pairs.py b/methods/matching/find_pairs.py index 7f17782..62e9e9c 100644 --- a/methods/matching/find_pairs.py +++ b/methods/matching/find_pairs.py @@ -3,16 +3,35 @@ import logging from functools import partial from multiprocessing import Pool, cpu_count, set_start_method -from numba import jit # type: ignore +from numba import njit # type: ignore import numpy as np import pandas as pd +from sklearn.decomposition import PCA +import faiss from methods.common.luc import luc_matching_columns +# TO DO: +# 1. Rename columns to luc10, luc5 and luc0 to align with the pipeline + +# to delete: +# start_year = 2012 +# k_parquet_filename = '/maps/tws36/tmf_pipe_out/1201/k_all.parquet' +# m_parquet_filename = '/maps/aew85/tmf_pipe_out/1201/tom_pairs/matches.parquet' +# luc_match = True + REPEAT_MATCH_FINDING = 100 -DEFAULT_DISTANCE = 10000000.0 DEBUG = False +K_SUB_PROPORTION = 0.01 +M_SUB_PROPORTION = 0.1 +# Number of clusters +NUM_CLUSTERS = 9 +# Number of iterations for K means fitting +NUM_ITERATIONS = 100 +RELATIVE_MATCH_YEARS = [-10, -5, 0] + + DISTANCE_COLUMNS = [ "elevation", "slope", "access", "cpc0_u", "cpc0_d", @@ -23,297 +42,219 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") +# this function uses loops instead of numpy vector operations +@njit +def loop_match(m_pca, k_pca): + picked = np.ones((m_pca.shape[0],), dtype=np.bool_) + fast_matches = np.full(k_pca.shape[0], -1, dtype=np.int32) + for i in range(0, k_pca.shape[0]): + min_squared_diff_sum = np.inf + min_squared_diff_j = -1 + for j in range(0, m_pca.shape[0]): + if picked[j]: + squared_diff_sum = np.sum((m_pca[j, :] - k_pca[i, :])**2) + if squared_diff_sum < min_squared_diff_sum: + min_squared_diff_sum = squared_diff_sum + min_squared_diff_j = j + fast_matches[i] = min_squared_diff_j + picked[min_squared_diff_j] = False + return fast_matches + +### Now try with real numbers + +def to_int32(x): + # Normalize the data to the range 0 to 1 + min_val = np.min(x) + max_val = np.max(x) + normalized_data = (x - min_val) / (max_val - min_val) + # Scale the normalized data to the range 0 to 255 for unsigned 8-bit integers + scaled_data = normalized_data * 255 + # Convert to 32-bit integers (0 to 255) + int32_data = scaled_data.astype(np.int32) + return int32_data + +def to_pca_int32(x): + # Perform PCA and convert to dataframe + pca = PCA(n_components=min(len(x), len(x.columns)), + whiten=False) # Centering and scaling done by default + pca_result = pca.fit_transform(x) + pca_df = pd.DataFrame(pca_result) + # Convert all columns to int8 + pca_32 = pca_df.apply(to_int32) + return pca_32 + + +def calculate_smd(group1, group2): + # Means + mean1, mean2 = np.mean(group1), np.mean(group2) + # Standard deviations + std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1) + # Sample sizes + n1, n2 = len(group1), len(group2) + # Pooled standard deviation + pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2)) + # Standardized mean difference + smd = (mean1 - mean2) / pooled_std + return smd, mean1, mean2, pooled_std + def find_match_iteration( - k_parquet_filename: str, - m_parquet_filename: str, + km_pixels: pd.DataFrame, + km_pca: np.ndarray, start_year: int, + luc_match: bool, output_folder: str, idx_and_seed: tuple[int, int] ) -> None: logging.info("Find match iteration %d of %d", idx_and_seed[0] + 1, REPEAT_MATCH_FINDING) rng = np.random.default_rng(idx_and_seed[1]) - - logging.info("Loading K from %s", k_parquet_filename) - - # Methodology 6.5.7: For a 10% sample of K - k_set = pd.read_parquet(k_parquet_filename) - k_subset = k_set.sample( - frac=0.1, - random_state=rng - ).reset_index() - - logging.info("Loading M from %s", m_parquet_filename) - m_set = pd.read_parquet(m_parquet_filename) - - # get the column ids for DISTANCE_COLUMNS - thresholds_for_columns = np.array([ - 200.0, # Elev - 2.5, # Slope - 10.0, # Access - 0.1, # CPCs - 0.1, # CPCs - 0.1, # CPCs - 0.1, # CPCs - 0.1, # CPCs - 0.1, # CPCs - ]) - - logging.info("Preparing s_set...") - - m_dist_thresholded_df = m_set[DISTANCE_COLUMNS] / thresholds_for_columns - k_subset_dist_thresholded_df = k_subset[DISTANCE_COLUMNS] / thresholds_for_columns - - # convert to float32 numpy arrays and make them contiguous for numba to vectorise - m_dist_thresholded = np.ascontiguousarray(m_dist_thresholded_df, dtype=np.float32) - k_subset_dist_thresholded = np.ascontiguousarray(k_subset_dist_thresholded_df, dtype=np.float32) - - # LUC columns are all named with the year in, so calculate the column names - # for the years we are intested in - luc0, luc5, luc10 = luc_matching_columns(start_year) - # As well as all the LUC columns for later use - luc_columns = [x for x in m_set.columns if x.startswith('luc')] - - hard_match_columns = ['country', 'ecoregion', luc10, luc5, luc0] - assert len(hard_match_columns) == HARD_COLUMN_COUNT - - # similar to the above, make the hard match columns contiguous float32 numpy arrays - m_dist_hard = np.ascontiguousarray(m_set[hard_match_columns].to_numpy()).astype(np.int32) - k_subset_dist_hard = np.ascontiguousarray(k_subset[hard_match_columns].to_numpy()).astype(np.int32) - - # Methodology 6.5.5: S should be 10 times the size of K, in order to achieve this for every - # pixel in the subsample (which is 10% the size of K) we select 100 pixels. - required = 100 - - logging.info("Running make_s_set_mask... required: %d", required) - starting_positions = rng.integers(0, int(m_dist_thresholded.shape[0]), int(k_subset_dist_thresholded.shape[0])) - s_set_mask_true, no_potentials = make_s_set_mask( - m_dist_thresholded, - k_subset_dist_thresholded, - m_dist_hard, - k_subset_dist_hard, - starting_positions, - required + + match_years = [start_year + year for year in RELATIVE_MATCH_YEARS] + # The categorical columns: + if luc_match: + match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years] + else: + match_cats = ["ecoregion", "country", "cluster"] + + # Extract K and M pixels + k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt'] + m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl'] + # Extract K and M PCA transforms + k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy() + m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy() + # Draw subsamples + # Methodology 6.5.7: Needs to be updated !!! + k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION) + m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION) + # Define indexs for the samples from K and M + k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False) + m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False) + # Take random samples from K and M pixels + k_sub = k_pixels.iloc[k_random_indices] + m_sub = m_pixels.iloc[m_random_indices] + # Take corresponding random samples from the PCA transformed K and M + k_sub_pca = k_pca[k_random_indices,:] + m_sub_pca = m_pca[m_random_indices,:] + + logging.info("Starting greedy matching... k_sub.shape: %s, m_sub.shape: %s", + k_sub.shape, m_sub.shape) + + pairs, matchless = greedy_match( + k_sub, + m_sub, + k_sub_pca, + m_sub_pca, + match_cats ) - - logging.info("Done make_s_set_mask. s_set_mask.shape: %a", {s_set_mask_true.shape}) - - s_set = m_set[s_set_mask_true] - potentials = np.invert(no_potentials) - - k_subset = k_subset[potentials] - logging.info("Finished preparing s_set. shape: %a", {s_set.shape}) - - # Notes: - # 1. Not all pixels may have matches - results = [] - matchless = [] - - s_set_for_cov = s_set[DISTANCE_COLUMNS] - logging.info("Calculating covariance...") - covarience = np.cov(s_set_for_cov, rowvar=False) - logging.info("Calculating inverse covariance...") - invconv = np.linalg.inv(covarience).astype(np.float32) - - # Match columns are luc10, luc5, luc0, "country" and "ecoregion" - s_set_match = s_set[hard_match_columns + DISTANCE_COLUMNS].to_numpy(dtype=np.float32) - # this is required so numba can vectorise the loop in greedy_match - s_set_match = np.ascontiguousarray(s_set_match) - - # Now we do the same thing for k_subset - k_subset_match = k_subset[hard_match_columns + DISTANCE_COLUMNS].to_numpy(dtype=np.float32) - # this is required so numba can vectorise the loop in greedy_match - k_subset_match = np.ascontiguousarray(k_subset_match) - - logging.info("Starting greedy matching... k_subset_match.shape: %s, s_set_match.shape: %s", - k_subset_match.shape, s_set_match.shape) - - add_results, k_idx_matchless = greedy_match( - k_subset_match, - s_set_match, - invconv - ) - + + # Combine all the pairs DataFrames in the list into a single DataFrame + combined_pairs = pd.concat(pairs, ignore_index=True) + # Combine all the matchess DataFrames in the list into a single DataFrame + combined_matchless = pd.concat(matchless, ignore_index=True) logging.info("Finished greedy matching...") - + logging.info("Starting storing matches...") - - for result in add_results: - (k_idx, s_idx) = result - k_row = k_subset.iloc[k_idx] - match = s_set.iloc[s_idx] - - if DEBUG: - for hard_match_column in hard_match_columns: - if k_row[hard_match_column] != match[hard_match_column]: - print(k_row) - print(match) - raise ValueError("Hard match inconsistency") - - results.append( - [k_row.lat, k_row.lng] + [k_row[x] for x in luc_columns + DISTANCE_COLUMNS] + \ - [match.lat, match.lng] + [match[x] for x in luc_columns + DISTANCE_COLUMNS] - ) - - logging.info("Finished storing matches...") - - for k_idx in k_idx_matchless: - k_row = k_subset.iloc[k_idx] - matchless.append(k_row) - - columns = ['k_lat', 'k_lng'] + \ - [f'k_{x}' for x in luc_columns + DISTANCE_COLUMNS] + \ - ['s_lat', 's_lng'] + \ - [f's_{x}' for x in luc_columns + DISTANCE_COLUMNS] - - results_df = pd.DataFrame(results, columns=columns) - results_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet')) - - matchless_df = pd.DataFrame(matchless, columns=k_set.columns) - matchless_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet')) - + combined_pairs_df = pd.DataFrame(combined_pairs) + combined_pairs_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet')) + + combined_matchless_df = pd.DataFrame(combined_matchless) + combined_matchless_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet')) + logging.info("Finished find match iteration") -@jit(nopython=True, fastmath=True, error_model="numpy") -def make_s_set_mask( - m_dist_thresholded: np.ndarray, - k_subset_dist_thresholded: np.ndarray, - m_dist_hard: np.ndarray, - k_subset_dist_hard: np.ndarray, - starting_positions: np.ndarray, - required: int -): - m_size = m_dist_thresholded.shape[0] - k_size = k_subset_dist_thresholded.shape[0] - - s_include = np.zeros(m_size, dtype=np.bool_) - k_miss = np.zeros(k_size, dtype=np.bool_) - - for k in range(k_size): - matches = 0 - k_row = k_subset_dist_thresholded[k, :] - k_hard = k_subset_dist_hard[k] - - for index in range(m_size): - m_index = (index + starting_positions[k]) % m_size - - m_row = m_dist_thresholded[m_index, :] - m_hard = m_dist_hard[m_index] - - should_include = True - - # check that every element of m_hard matches k_hard - hard_equals = True - for j in range(m_hard.shape[0]): - if m_hard[j] != k_hard[j]: - hard_equals = False - - if not hard_equals: - should_include = False - else: - for j in range(m_row.shape[0]): - if abs(m_row[j] - k_row[j]) > 1.0: - should_include = False - - if should_include: - s_include[m_index] = True - matches += 1 - - # Don't find any more M's - if matches == required: - break - - k_miss[k] = matches == 0 - - return s_include, k_miss - -# Function which returns a boolean array indicating whether all values in a row are true -@jit(nopython=True, fastmath=True, error_model="numpy") -def rows_all_true(rows: np.ndarray): - # Don't use np.all because not supported by numba - - # Create an array of booleans for rows in s still available - all_true = np.ones((rows.shape[0],), dtype=np.bool_) - for row_idx in range(rows.shape[0]): - for col_idx in range(rows.shape[1]): - if not rows[row_idx, col_idx]: - all_true[row_idx] = False - break - - return all_true - - -@jit(nopython=True, fastmath=True, error_model="numpy") def greedy_match( - k_subset: np.ndarray, - s_set: np.ndarray, - invcov: np.ndarray + k_sub: pd.DataFrame, + m_sub: pd.DataFrame, + k_sub_pca: np.ndarray, + m_sub_pca: np.ndarray, + match_cats: list ): - # Create an array of booleans for rows in s still available - s_available = np.ones((s_set.shape[0],), dtype=np.bool_) - total_available = s_set.shape[0] - - results = [] + # Identify the unique combinations of categorical columns + k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats)) + + # Not all pixels may have matches + pairs = [] matchless = [] - - s_tmp = np.zeros((s_set.shape[0],), dtype=np.float32) - - for k_idx in range(k_subset.shape[0]): - k_row = k_subset[k_idx, :] - - hard_matches = rows_all_true(s_set[:, :HARD_COLUMN_COUNT] == k_row[:HARD_COLUMN_COUNT]) & s_available - hard_matches = hard_matches.reshape( - -1, - ) - - if total_available > 0: - # Now calculate the distance between the k_row and all the hard matches we haven't already matched - s_tmp[hard_matches] = batch_mahalanobis_squared( - s_set[hard_matches, HARD_COLUMN_COUNT:], k_row[HARD_COLUMN_COUNT:], invcov - ) - # Find the index of the minimum distance in s_tmp[hard_matches] but map it back to the index in s_set - if np.any(hard_matches): - min_dist_idx = np.argmin(s_tmp[hard_matches]) - min_dist_idx = np.arange(s_tmp.shape[0])[hard_matches][min_dist_idx] - - results.append((k_idx, min_dist_idx)) - s_available[min_dist_idx] = False - total_available -= 1 - else: - matchless.append(k_idx) - - return (results, matchless) - -# optimised batch implementation of mahalanobis distance that returns a distance per row -@jit(nopython=True, fastmath=True, error_model="numpy") -def batch_mahalanobis_squared(rows, vector, invcov): - # calculate the difference between the vector and each row (broadcasted) - diff = rows - vector - # calculate the distance for each row in one batch - dists = (np.dot(diff, invcov) * diff).sum(axis=1) - return dists - + + for i in range(0, k_cat_combinations.shape[0]): + # i = 6 # ith element of the unique combinations of the luc time series in k + # for in range() + k_cat_comb = k_cat_combinations.iloc[i] + k_cat_index = k_sub[match_cats] == k_cat_comb + k_cat = k_sub[(k_cat_index).all(axis=1)] + k_cat_pca = k_sub_pca[(k_cat_index).all(axis=1)] + + # Find the subset in km_pixels that matches this combination + m_cat_index = m_sub[match_cats] == k_cat_comb + m_cat = m_sub[(m_cat_index).all(axis=1)] + m_cat_pca = m_sub_pca[(m_cat_index).all(axis=1)] + + # If there is no suitable match for the pre-project luc time series + # Then it may be preferable to just take the luc state at t0 + # m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == K_luc_comb[1:3]).all(axis=1)] + # m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)] + # For now if there are no matches return nothing + + if(m_cat.shape[0] < k_cat.shape[0] * 5): + matchless.append(k_cat) + continue + + matches_index = loop_match(m_cat_pca, k_cat_pca) + m_cat_matches = m_cat.iloc[matches_index] + + # Join the pairs into one dataframe: + k_cat = k_cat.reset_index(drop = True) + m_cat_matches = m_cat_matches.reset_index(drop = True) + pairs_df = pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1) + # Append the resulting DataFrame to the list + pairs.append(pairs_df) + + return (pairs, matchless) def find_pairs( k_parquet_filename: str, m_parquet_filename: str, start_year: int, + luc_match: bool, seed: int, output_folder: str, processes_count: int ) -> None: + logging.info("Loading K from %s", k_parquet_filename) + k_pixels = pd.read_parquet(k_parquet_filename) + logging.info("Loading M from %s", m_parquet_filename) + m_pixels = pd.read_parquet(m_parquet_filename) + # concat m and k + km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))), + m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True) + + # Extract only the continuous columns + km_pixels_distance = km_pixels[DISTANCE_COLUMNS] + # PCA transform and conversion to 32 bit ints + logging.info("Transforming continuous variables to PCA space") + km_pca = to_pca_int32(km_pixels_distance) + # Find clusters using Kmeans + logging.info("Starting cluster assignment using kmeans") + # Initialize the KMeans object + kmeans = faiss.Kmeans(d=km_pca.shape[1], k=NUM_CLUSTERS, niter=NUM_ITERATIONS, verbose=True) + # Perform clustering + kmeans.train(km_pca) + # Get cluster assignments + km_pixels['cluster'] = kmeans.index.search(km_pca, 1)[1].flatten() + logging.info("Starting find pairs") os.makedirs(output_folder, exist_ok=True) - + rng = np.random.default_rng(seed) iteration_seeds = zip(range(REPEAT_MATCH_FINDING), rng.integers(0, 1000000, REPEAT_MATCH_FINDING)) - + with Pool(processes=processes_count) as pool: pool.map( partial( find_match_iteration, - k_parquet_filename, - m_parquet_filename, + km_pixels, + km_pca, start_year, + luc_match, output_folder ), iteration_seeds @@ -346,6 +287,13 @@ def main(): dest="start_year", help="Year project started." ) + parser.add_argument( + "--luc_match", + type=bool, + required=True, + dest="luc_match", + help="Boolean determines whether matching should include LUCs." + ) parser.add_argument( "--seed", type=int, @@ -374,6 +322,7 @@ def main(): args.k_filename, args.m_filename, args.start_year, + args.luc_match, args.seed, args.output_directory_path, args.processes_count From 4a7abcec0f6af5838065e0650142db6e537929c8 Mon Sep 17 00:00:00 2001 From: swinersha Date: Wed, 7 Aug 2024 13:34:41 +0000 Subject: [PATCH 3/6] fix: sppeds up find pairs by sampling M to the size of K --- .../cluster_find_pairs_interactive.py | 303 +++--------------- methods/matching/find_pairs.py | 54 ++-- 2 files changed, 80 insertions(+), 277 deletions(-) diff --git a/methods/matching/cluster_find_pairs_interactive.py b/methods/matching/cluster_find_pairs_interactive.py index 9ac25fb..215722b 100644 --- a/methods/matching/cluster_find_pairs_interactive.py +++ b/methods/matching/cluster_find_pairs_interactive.py @@ -7,6 +7,7 @@ from sklearn.cluster import KMeans import matplotlib.pyplot as plt import geopandas as gpd +from pyproj import Proj, transform import os import time import sys @@ -108,14 +109,53 @@ def calculate_smd(group1, group2): # Define the start year t0 = 2012 # READ THIS IN -match_years = [t0-10, t0-5, t0] # Read in the data boundary = gpd.read_file('/maps/aew85/projects/1201.geojson') -k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet') +k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k.parquet') +# k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet') m_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/1201/matches.parquet') + +t0 = 2018 +boundary = gpd.read_file('/maps/aew85/projects/ona.geojson') +k_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/fastfp_test_ona/k.parquet') +m_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/fastfp_test_ona/matches.parquet') + +if(m_pixels.shape[0] > (k_pixels.shape[0])): + m_sub_size = int(k_pixels.shape[0]) # First down sample M as it is ~230 million points + m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False) + m_pixels = m_pixels.iloc[m_random_indices] + +# # Calculate the central coordinates (centroid) +# central_lat = m_pixels['lat'].mean() +# central_lon = m_pixels['lng'].mean() +# aeqd_proj = f"+proj=aeqd +lat_0={central_lat} +lon_0={central_lon} +datum=WGS84" + +# # Convert the DataFrame to a GeoDataFrame +# m_gdf = gpd.GeoDataFrame(m_pixels, geometry=gpd.points_from_xy(m_pixels.lng, m_pixels.lat)) +# # Set the original CRS to WGS84 (EPSG:4326) +# m_gdf.set_crs(epsg=4326, inplace=True) + +# # Transform the GeoDataFrame to the AEQD projection +# m_gdf_aeqd = m_gdf.to_crs(aeqd_proj) + +# # Extract the transformed coordinates +# gdf_aeqd['aeqd_x'] = gdf_aeqd.geometry.x +# gdf_aeqd['aeqd_y'] = gdf_aeqd.geometry.y + +# # Define the grid resolution in meters +# grid_resolution_m = 5000 # 5 km + +# # Calculate grid cell indices +# gdf_aeqd['grid_x'] = (gdf_aeqd['aeqd_x'] // grid_resolution_m).astype(int) +# gdf_aeqd['grid_y'] = (gdf_aeqd['aeqd_y'] // grid_resolution_m).astype(int) + +# # Print the first few rows to verify +# print(gdf_aeqd.head()) + + # concat m and k km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))), m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True) @@ -124,8 +164,6 @@ def calculate_smd(group1, group2): exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt'] exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')] -match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years] - # Extract only the continuous columns continuous_columns = km_pixels.columns.difference(exclude_columns) km_pixels_selected = km_pixels[continuous_columns] @@ -178,7 +216,7 @@ def calculate_smd(group1, group2): plt.ylabel('PCA Component 2') plt.legend() plt.show() - plt.savefig('Figures/cluster_centres_faiss_1.png') + plt.savefig('Figures/ona_cluster_centres_faiss_1.png') plt.close() # Close the plot to free up memory #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -202,11 +240,14 @@ def calculate_smd(group1, group2): fig.delaxes(axes[j]) plt.tight_layout() - plt.savefig('Figures/cluster_faiss_1_facet.png') + plt.savefig('Figures/Ona_cluster_faiss_1_facet.png') plt.close() #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +match_years = [t0-10, t0-5, t0] +match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years] + # Extract K and M pixels k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt'] m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl'] @@ -216,7 +257,7 @@ def calculate_smd(group1, group2): m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy() k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION) -m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION) +m_sub_size = int(m_pixels.shape[0] * 1) # Define indexs for the samples from K and M k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False) @@ -330,252 +371,4 @@ def calculate_smd(group1, group2): smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std']) print(smd_df) -smd_results = [] -for column in columns_to_compare: - smd, mean1, mean2, pooled_std = calculate_smd(k_pixels[column], m_pixels[column]) - smd_results.append((column, smd, mean1, mean2, pooled_std)) -# Convert the results to a DataFrame for better readability -smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std']) -print(smd_df) - - - - - - - -def find_match_iteration( - k_parquet_filename: str, - m_parquet_filename: str, - start_year: int, - output_folder: str, - idx_and_seed: tuple[int, int] -) -> None: - logging.info("Find match iteration %d of %d", idx_and_seed[0] + 1, REPEAT_MATCH_FINDING) - rng = np.random.default_rng(idx_and_seed[1]) - - logging.info("Loading K from %s", k_parquet_filename) - k_pixels = pd.read_parquet(k_parquet_filename) - - logging.info("Loading M from %s", m_parquet_filename) - m_pixels = pd.read_parquet(m_parquet_filename) - - # concat m and k - km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))), - m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True) - - # Find the continuous columns - exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt'] - exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')] - continuous_columns = km_pixels.columns.difference(exclude_columns) - # Categorical columns - match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years] - - logging.info("Starting PCA transformation of k and m union. km_pixels.shape: %a", {km_pixels.shape}) - # PCA transform and conversion to 32 bit ints for continuous only - km_pca = to_pca_int32(km_pixels[continuous_columns]) - logging.info("Done PCA transformation") - - # Extract K and M pixels - this might be unnecessary I just wanted to make sure - # K and M were in the same order here and in the PCA transform - k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt'] - m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl'] - # Extract K and M PCA transforms - k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy() - m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy() - - # Sample from K and M - k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION) - m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION) - # Define indexs for the samples from K and M - k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False) - m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False) - # Take random samples from K and M pixels - k_sub = k_pixels.iloc[k_random_indices] - m_sub = m_pixels.iloc[m_random_indices] - # Take corresponding random samples from the PCA transformed K and M - k_sub_pca = k_pca[k_random_indices,:] - m_sub_pca = m_pca[m_random_indices,:] - - logging.info("Samples taken from K and M. k_sub.shape: %a; m_sub.shape: %a", {k_sub.shape, m_sub.shape}) - - # Identify the unique combinations of luc columns - k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats)) - - pairs_list = [] - matchless_list = [] - - logging.info("Starting greedy matching... k_sub.shape: %s, m_sub.shape: %s", - k_sub.shape, m_sub.shape) - - start_time = time.time() - for i in range(0, k_cat_combinations.shape[0]): - # i = 6 # ith element of the unique combinations of the luc time series in k - # for in range() - k_cat_comb = k_cat_combinations.iloc[i] - k_cat = k_sub[(k_sub[match_cats] == k_cat_comb).all(axis=1)] - k_cat_pca = k_sub_pca[(k_sub[match_cats] == k_cat_comb).all(axis=1)] - - # Find the subset in km_pixels that matches this combination - m_cat = m_sub[(m_sub[match_cats] == k_cat_comb).all(axis=1)] - m_cat_pca = m_sub_pca[(m_sub[match_cats] == k_cat_comb).all(axis=1)] - - if VERBOSE: - print('ksub_cat:' + str(k_cat.shape[0])) - print('msub_cat:' + str(m_cat.shape[0])) - - # If there is no suitable match for the pre-project luc time series - # Then it may be preferable to just take the luc state at t0 - # m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == K_luc_comb[1:3]).all(axis=1)] - # m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)] - # For if there are no matches return nothing - - if(m_cat.shape[0] < k_cat.shape[0] * 5): - # print("M insufficient for matching. Set VERBOSE to True for more details.") - # Append the matchless DataFrame to the list - matchless_list.append(k_cat) - continue - - # Find the matches - matches_index = loop_match(m_cat_pca, k_cat_pca) - m_cat_matches = m_cat.iloc[matches_index] - - # i = 0 - # matched = pd.concat([k_cat.iloc[i], m_cat.iloc[matches[i]]], axis=1, ignore_index=True) - # matched.columns = ['trt', 'ctrl'] - # matched - #Looks great! - columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope'] - # Calculate SMDs for the specified columns - smd_results = [] - for column in columns_to_compare: - smd, mean1, mean2, pooled_std = calculate_smd(k_cat[column], m_cat_matches[column]) - smd_results.append((column, smd, mean1, mean2, pooled_std)) - - # Convert the results to a DataFrame for better readability - smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std']) - - if VERBOSE: - # Print the results - print("categorical combination:") - print(k_cat_comb) - # Count how many items in 'column1' are not equal to the specified integer value - print("LUC flips in K:") - (k_cat['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum() - print("LUC flips in matches:") - (m_cat_matches['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum() - print("Standardized Mean Differences:") - print(smd_df) - - # Join the pairs into one dataframe: - k_cat = k_cat.reset_index(drop = True) - m_cat_matches = m_cat_matches.reset_index(drop = True) - pairs_df = pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1) - - # Append the resulting DataFrame to the list - pairs_list.append(pairs_df) - - # Combine all the DataFrames in the list into a single DataFrame - pairs = pd.concat(pairs_list, ignore_index=True) - matchless = pd.concat(matchless_list, ignore_index=True) - - logging.info("Finished greedy matching... pairs.shape: %s, matchless.shape: %s", - pairs.shape, matchless.shape) - - logging.info("Starting storing matches...") - pairs.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet')) - matchless.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet')) - - logging.info("Finished find match iteration") - - -def find_pairs( - k_parquet_filename: str, - m_parquet_filename: str, - start_year: int, - seed: int, - output_folder: str, - processes_count: int -) -> None: - logging.info("Starting find pairs") - os.makedirs(output_folder, exist_ok=True) - - rng = np.random.default_rng(seed) - iteration_seeds = zip(range(REPEAT_MATCH_FINDING), rng.integers(0, 1000000, REPEAT_MATCH_FINDING)) - - with Pool(processes=processes_count) as pool: - pool.map( - partial( - find_match_iteration, - k_parquet_filename, - m_parquet_filename, - start_year, - output_folder - ), - iteration_seeds - ) - - -def main(): - # If you use the default multiprocess model then you risk deadlocks when logging (which we - # have hit). Spawn is the default on macOS, but not on Linux. - set_start_method("spawn") - - parser = argparse.ArgumentParser(description="Takes K and M and finds 100 sets of matches.") - parser.add_argument( - "--k", - type=str, - required=True, - dest="k_filename", - help="Parquet file containing pixels from K as generated by calculate_k.py" - ) - parser.add_argument( - "--m", - type=str, - required=True, - dest="m_filename", - help="Parquet file containing pixels from M as generated by build_m_table.py" - ) - parser.add_argument( - "--start_year", - type=int, - required=True, - dest="start_year", - help="Year project started." - ) - parser.add_argument( - "--seed", - type=int, - required=True, - dest="seed", - help="Random number seed, to ensure experiments are repeatable." - ) - parser.add_argument( - "--output", - type=str, - required=True, - dest="output_directory_path", - help="Directory into which output matches will be written. Will be created if it does not exist." - ) - parser.add_argument( - "-j", - type=int, - required=False, - default=round(cpu_count() / 2), - dest="processes_count", - help="Number of concurrent threads to use." - ) - args = parser.parse_args() - - find_pairs( - args.k_filename, - args.m_filename, - args.start_year, - args.seed, - args.output_directory_path, - args.processes_count - ) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/methods/matching/find_pairs.py b/methods/matching/find_pairs.py index 62e9e9c..e253014 100644 --- a/methods/matching/find_pairs.py +++ b/methods/matching/find_pairs.py @@ -20,11 +20,15 @@ # m_parquet_filename = '/maps/aew85/tmf_pipe_out/1201/tom_pairs/matches.parquet' # luc_match = True +t0 = 2018 +k_parquet_filename = '/maps/aew85/tmf_pipe_out/fastfp_test_ona/k.parquet' +m_parquet_filename = '/maps/aew85/tmf_pipe_out/fastfp_test_ona/matches.parquet' + REPEAT_MATCH_FINDING = 100 DEBUG = False K_SUB_PROPORTION = 0.01 -M_SUB_PROPORTION = 0.1 +M_SUB_PROPORTION = 1 # Number of clusters NUM_CLUSTERS = 9 # Number of iterations for K means fitting @@ -98,8 +102,8 @@ def calculate_smd(group1, group2): return smd, mean1, mean2, pooled_std def find_match_iteration( - km_pixels: pd.DataFrame, - km_pca: np.ndarray, + k_pixels: pd.DataFrame, + m_pixels: pd.DataFrame, start_year: int, luc_match: bool, output_folder: str, @@ -115,6 +119,29 @@ def find_match_iteration( else: match_cats = ["ecoregion", "country", "cluster"] + if(m_pixels.shape[0] > (k_pixels.shape[0])): + m_sub_size = int(k_pixels.shape[0]) # First down sample M as it is ~230 million points + m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False) + m_pixels = m_pixels.iloc[m_random_indices] + + # concat m and k + km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))), + m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True) + + # Extract only the continuous columns + km_pixels_distance = km_pixels[DISTANCE_COLUMNS] + # PCA transform and conversion to 32 bit ints + logging.info("Transforming continuous variables to PCA space") + km_pca = to_pca_int32(km_pixels_distance) + # Find clusters using Kmeans + logging.info("Starting cluster assignment using kmeans") + # Initialize the KMeans object + kmeans = faiss.Kmeans(d=km_pca.shape[1], k=NUM_CLUSTERS, niter=NUM_ITERATIONS, verbose=True) + # Perform clustering + kmeans.train(km_pca) + # Get cluster assignments + km_pixels['cluster'] = kmeans.index.search(km_pca, 1)[1].flatten() + # Extract K and M pixels k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt'] m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl'] @@ -223,23 +250,6 @@ def find_pairs( k_pixels = pd.read_parquet(k_parquet_filename) logging.info("Loading M from %s", m_parquet_filename) m_pixels = pd.read_parquet(m_parquet_filename) - # concat m and k - km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))), - m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True) - - # Extract only the continuous columns - km_pixels_distance = km_pixels[DISTANCE_COLUMNS] - # PCA transform and conversion to 32 bit ints - logging.info("Transforming continuous variables to PCA space") - km_pca = to_pca_int32(km_pixels_distance) - # Find clusters using Kmeans - logging.info("Starting cluster assignment using kmeans") - # Initialize the KMeans object - kmeans = faiss.Kmeans(d=km_pca.shape[1], k=NUM_CLUSTERS, niter=NUM_ITERATIONS, verbose=True) - # Perform clustering - kmeans.train(km_pca) - # Get cluster assignments - km_pixels['cluster'] = kmeans.index.search(km_pca, 1)[1].flatten() logging.info("Starting find pairs") os.makedirs(output_folder, exist_ok=True) @@ -251,8 +261,8 @@ def find_pairs( pool.map( partial( find_match_iteration, - km_pixels, - km_pca, + k_pixels, + m_pixels, start_year, luc_match, output_folder From 81377a6217247dd8635223a4cac2bb66785c50de Mon Sep 17 00:00:00 2001 From: Abby Williams <149403544+abbyevewilliams@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:42:37 +0100 Subject: [PATCH 4/6] Update calculate_k.py Removed the problematic lines for ecoregion and access --- methods/matching/calculate_k.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/methods/matching/calculate_k.py b/methods/matching/calculate_k.py index dba2284..60ad146 100644 --- a/methods/matching/calculate_k.py +++ b/methods/matching/calculate_k.py @@ -73,7 +73,6 @@ def build_layer_collection( # RasterLayer.layer_from_file(os.path.join(ecoregions_directory_path, filename)) for filename in # glob.glob("*.tif", root_dir=ecoregions_directory_path) # ], name="ecoregions") - ecoregions = RasterLayer.layer_from_file(ecoregions_directory_path) elevation = GroupLayer([ RasterLayer.layer_from_file(os.path.join(elevation_directory_path, filename)) for filename in @@ -88,7 +87,6 @@ def build_layer_collection( # RasterLayer.layer_from_file(os.path.join(access_directory_path, filename)) for filename in # glob.glob("*.tif", root_dir=access_directory_path) # ], name="access") - access = RasterLayer.layer_from_file(access_directory_path) countries = RasterLayer.layer_from_file(countries_raster_filename) From 2bec9d97789b45c077481591c0c2465d9fee7b66 Mon Sep 17 00:00:00 2001 From: Abby Williams <149403544+abbyevewilliams@users.noreply.github.com> Date: Thu, 8 Aug 2024 12:46:30 +0100 Subject: [PATCH 5/6] Update calculate_k.py Uncommented lines needed for ecoregion and access --- methods/matching/calculate_k.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/methods/matching/calculate_k.py b/methods/matching/calculate_k.py index 60ad146..a5fa156 100644 --- a/methods/matching/calculate_k.py +++ b/methods/matching/calculate_k.py @@ -69,10 +69,10 @@ def build_layer_collection( # ecoregions is such a heavy layer it pays to just rasterize it once - we should possibly do this once # as part of import of the ecoregions data - # ecoregions = GroupLayer([ - # RasterLayer.layer_from_file(os.path.join(ecoregions_directory_path, filename)) for filename in - # glob.glob("*.tif", root_dir=ecoregions_directory_path) - # ], name="ecoregions") + ecoregions = GroupLayer([ + RasterLayer.layer_from_file(os.path.join(ecoregions_directory_path, filename)) for filename in + glob.glob("*.tif", root_dir=ecoregions_directory_path) + ], name="ecoregions") elevation = GroupLayer([ RasterLayer.layer_from_file(os.path.join(elevation_directory_path, filename)) for filename in @@ -83,10 +83,10 @@ def build_layer_collection( glob.glob("slope*.tif", root_dir=slope_directory_path) ], name="slopes") - # access = GroupLayer([ - # RasterLayer.layer_from_file(os.path.join(access_directory_path, filename)) for filename in - # glob.glob("*.tif", root_dir=access_directory_path) - # ], name="access") + access = GroupLayer([ + RasterLayer.layer_from_file(os.path.join(access_directory_path, filename)) for filename in + glob.glob("*.tif", root_dir=access_directory_path) + ], name="access") countries = RasterLayer.layer_from_file(countries_raster_filename) From fa83f7ade3c632a9fb5783230f39cfae7c4c7c7e Mon Sep 17 00:00:00 2001 From: swinersha Date: Mon, 12 Aug 2024 15:17:21 +0000 Subject: [PATCH 6/6] feat: code to tile M into spatial cells added --- methods/matching/tile_m.py | 111 +++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 methods/matching/tile_m.py diff --git a/methods/matching/tile_m.py b/methods/matching/tile_m.py new file mode 100644 index 0000000..8df0064 --- /dev/null +++ b/methods/matching/tile_m.py @@ -0,0 +1,111 @@ + +import pandas as pd +import numpy as np +from pyproj import Proj +from numba import jit +import time +import os + +grid_resolution_m = 5000 + +# Read in the data +m_parquet_filename = '/maps/aew85/tmf_pipe_out/1201/matches.parquet' + +m_parquet_dirname = os.path.dirname(m_parquet_filename) + +k_pixels = pd.read_parquet(k_parquet_filename) +# k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet') +m_pixels = pd.read_parquet(m_parquet_filename) + +def assign_cells(aeqd_x, aeqd_y): + grid_x = (aeqd_x // grid_resolution_m).astype(int) + grid_y = (aeqd_y // grid_resolution_m).astype(int) + return grid_x, grid_y + + +# Create the directory if it doesn't exist +output_dir = m_parquet_dirname + '/m_tiles' +if not os.path.exists(output_dir): + os.makedirs(output_dir) + +# Define the AEQD projection +lat_mean = m_pixels['lat'].mean() +lng_mean = m_pixels['lng'].mean() +aeqd_proj = Proj(proj='aeqd', lat_0=lat_mean, lon_0=lng_mean, datum='WGS84') + +# Vectorized transformation using pyproj (no need for a loop) +m_pixels['aeqd_x'], m_pixels['aeqd_y'] = aeqd_proj(m_pixels['lng'].values, m_pixels['lat'].values) +m_pixels['grid_x'], m_pixels['grid_y'] = assign_cells(m_pixels['aeqd_x'], m_pixels['aeqd_y']) + +print(m_pixels[['grid_x', 'grid_y']]) + +# Create a unique identifier for each grid cell +m_pixels['grid_id'] = m_pixels['grid_x'].astype(str) + '_' + m_pixels['grid_y'].astype(str) +# Check the number of unique grid cells +print(f"Number of unique grid cells: {m_pixels['grid_id'].nunique()}") + +# # Create a color palette with a unique color for each grid_id +# unique_grid_ids = m_pixels['grid_id'].unique() +# palette = sns.color_palette("husl", len(unique_grid_ids)) +# color_dict = dict(zip(unique_grid_ids, palette)) + +# # Map the grid_id to a color +# m_pixels['color'] = m_pixels['grid_id'].map(color_dict) + +# # Plot the points +# import matplotlib.pyplot as plt +# import seaborn as sns +# plt.figure(figsize=(10, 8)) +# plt.scatter(m_pixels['lng'], m_pixels['lat'], c=m_pixels['color'], s=1, alpha=0.6) +# plt.title('Points Colored by Unique Grid Cell') +# plt.xlabel('Longitude') +# plt.ylabel('Latitude') +# plt.savefig('Figures/Gola_m_grid_cells.png') +# plt.close() + +# Loop through each unique grid_id +for grid_id in m_pixels['grid_id'].unique(): + # Filter the data for this grid_id + grid_data = m_pixels[m_pixels['grid_id'] == grid_id] + + # Create a unique file name + file_name = f"m_tile_{grid_id}.parquet" + file_path = os.path.join(output_dir, file_name) + + # Save the DataFrame as a parquet file + grid_data.to_parquet(file_path, index=False) + print(f"Saved {file_name} with {len(grid_data)} records.") + + +#----------------------------------------- +# Code to move to find pairs +# just testing for now + + + +import random + +# Directory containing the Parquet files +input_dir = m_parquet_dirname + '/m_tiles' + +# List all Parquet files in the directory +all_files = [f for f in os.listdir(input_dir) if f.endswith('.parquet')] + +# Determine the number of files to read (10% of total files) +num_files_to_read = max(1, int(0.1 * len(all_files))) # Ensure at least 1 file is read + +# Randomly select 10% of the files +selected_files = random.sample(all_files, num_files_to_read) + +# Read the selected files and store them in a list of DataFrames +dfs = [] +for file_name in selected_files: + file_path = os.path.join(input_dir, file_name) + df = pd.read_parquet(file_path) + dfs.append(df) + +# Concatenate all the DataFrames into a single DataFrame +combined_df = pd.concat(dfs, ignore_index=True) + +# Print information about the loaded data +print(f"Loaded {len(dfs)} files with a total of {len(combined_df)} records.") \ No newline at end of file