From 2cfc1ba4ea7f9cb2bd5c7e8707f8baa8f2c081e4 Mon Sep 17 00:00:00 2001
From: swinersha <twswinfield@gmai.com>
Date: Wed, 31 Jul 2024 13:44:46 +0000
Subject: [PATCH 1/6] feat: k_all pixels and interactive find pairs drafted

---
 .gitignore                                    |   2 +-
 methods/matching/calculate_k.py               |  29 +-
 .../cluster_find_pairs_interactive.py         | 581 ++++++++++++++++++
 3 files changed, 599 insertions(+), 13 deletions(-)
 create mode 100644 methods/matching/cluster_find_pairs_interactive.py

diff --git a/.gitignore b/.gitignore
index eac8079..123285e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 .DS_Store
 *.npy
 data
-
+Figures
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/methods/matching/calculate_k.py b/methods/matching/calculate_k.py
index dfecd05..dba2284 100644
--- a/methods/matching/calculate_k.py
+++ b/methods/matching/calculate_k.py
@@ -21,10 +21,13 @@
 LARGE_PROJECT_PIXEL_DENSITY_PER_HECTARE = 0.05
 
 # The '2 *' in this is because I'm just considering one axis, rather than area
-PIXEL_SKIP_SMALL_PROJECT = \
-    round((HECTARE_WIDTH_IN_METERS / (2 * SMALL_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS)
-PIXEL_SKIP_LARGE_PROJECT = \
-    round((HECTARE_WIDTH_IN_METERS / (2 * LARGE_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS)
+# PIXEL_SKIP_SMALL_PROJECT = \
+#     round((HECTARE_WIDTH_IN_METERS / (2 * SMALL_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS)
+# PIXEL_SKIP_LARGE_PROJECT = \
+#     round((HECTARE_WIDTH_IN_METERS / (2 * LARGE_PROJECT_PIXEL_DENSITY_PER_HECTARE)) / PIXEL_WIDTH_IN_METERS)
+
+PIXEL_SKIP_SMALL_PROJECT = 1
+PIXEL_SKIP_LARGE_PROJECT = 1
 
 MatchingCollection = namedtuple('MatchingCollection',
     ['boundary', 'lucs', 'cpcs', 'ecoregions', 'elevation', 'slope', 'access', 'countries'])
@@ -66,10 +69,11 @@ def build_layer_collection(
 
     # ecoregions is such a heavy layer it pays to just rasterize it once - we should possibly do this once
     # as part of import of the ecoregions data
-    ecoregions = GroupLayer([
-        RasterLayer.layer_from_file(os.path.join(ecoregions_directory_path, filename)) for filename in
-            glob.glob("*.tif", root_dir=ecoregions_directory_path)
-    ], name="ecoregions")
+    # ecoregions = GroupLayer([
+    #     RasterLayer.layer_from_file(os.path.join(ecoregions_directory_path, filename)) for filename in
+    #         glob.glob("*.tif", root_dir=ecoregions_directory_path)
+    # ], name="ecoregions")
+    ecoregions = RasterLayer.layer_from_file(ecoregions_directory_path)
 
     elevation = GroupLayer([
         RasterLayer.layer_from_file(os.path.join(elevation_directory_path, filename)) for filename in
@@ -80,10 +84,11 @@ def build_layer_collection(
             glob.glob("slope*.tif", root_dir=slope_directory_path)
     ], name="slopes")
 
-    access = GroupLayer([
-        RasterLayer.layer_from_file(os.path.join(access_directory_path, filename)) for filename in
-            glob.glob("*.tif", root_dir=access_directory_path)
-    ], name="access")
+    # access = GroupLayer([
+    #     RasterLayer.layer_from_file(os.path.join(access_directory_path, filename)) for filename in
+    #         glob.glob("*.tif", root_dir=access_directory_path)
+    # ], name="access")
+    access = RasterLayer.layer_from_file(access_directory_path)
 
     countries = RasterLayer.layer_from_file(countries_raster_filename)
 
diff --git a/methods/matching/cluster_find_pairs_interactive.py b/methods/matching/cluster_find_pairs_interactive.py
new file mode 100644
index 0000000..9ac25fb
--- /dev/null
+++ b/methods/matching/cluster_find_pairs_interactive.py
@@ -0,0 +1,581 @@
+# source myenv/bin/activate
+
+import pandas as pd
+import numpy as np
+from numba import njit
+from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans
+import matplotlib.pyplot as plt
+import geopandas as gpd
+import os
+import time
+import sys
+import faiss
+
+# Read K and M
+# Repeat 100 times - Ultimately we would like to remove this step
+#   PCA across K and M
+#   Divide into clusters - Check how much splitting into clusters speeds things up
+#   Sample K to find K_sub
+#   Sample M to find M_sub
+#   Split M into LUC combinations - How big is each set?
+#   For each cluster in K_sub
+#       Split cluster in K_sub into categorical combinations - How big is each set?
+#       For each categorical combination
+#           Sample from the equivalent cluster and categorical combindation in M_sub
+#           Find pairs for the categorical combinations in K_sub from the categorical combinations in M_sub
+#       RowBind categorical combination sets
+#   RowBind cluster sets
+#   Save Pairs
+
+
+# NOTES
+# 1. We might need to combine some categorical subsets because at least for comparing validity of the matches
+#    because if the subsets are too small the standardised mean differences can be very wrong
+# 2. One option would be to combine clusters based on the proximity of the cluster centres. For the LUCs, we might 
+#    combine groups that are in the same state when the project begins, even if the LUC history is not the same.
+# 3. There is a question of how much supposed additionality is created by each categorical subset? If it is 
+#    nothing, it might not matter. If it is substantive then it definitely does matter.
+
+
+
+# this function uses loops instead of numpy vector operations
+@njit
+def loop_match(m_pca, k_pca):
+    picked = np.ones((m_pca.shape[0],), dtype=np.bool_)
+    fast_matches = np.full(k_pca.shape[0], -1, dtype=np.int32)
+    for i in range(0, k_pca.shape[0]):
+        min_squared_diff_sum = np.inf
+        min_squared_diff_j = -1
+        for j in range(0, m_pca.shape[0]):
+            if picked[j]:
+                squared_diff_sum = np.sum((m_pca[j, :] - k_pca[i, :])**2)
+                if squared_diff_sum < min_squared_diff_sum:
+                    min_squared_diff_sum = squared_diff_sum
+                    min_squared_diff_j = j
+        fast_matches[i] = min_squared_diff_j
+        picked[min_squared_diff_j] = False
+    return fast_matches
+
+### Now try with real numbers
+
+def to_int32(x):
+    # Normalize the data to the range 0 to 1
+    min_val = np.min(x)
+    max_val = np.max(x)
+    normalized_data = (x - min_val) / (max_val - min_val)
+    
+    # Scale the normalized data to the range 0 to 255 for unsigned 8-bit integers
+    scaled_data = normalized_data * 255
+    
+    # Convert to 32-bit integers (0 to 255)
+    int32_data = scaled_data.astype(np.int32)
+    
+    return int32_data
+
+def to_pca_int32(x):
+    # Perform PCA and convert to dataframe
+    pca = PCA(n_components=min(len(x), len(x.columns)), 
+            whiten=False)  # Centering and scaling done by default
+    pca_result = pca.fit_transform(x)
+    pca_df = pd.DataFrame(pca_result)
+    # Convert all columns to int8
+    pca_32 = pca_df.apply(to_int32)
+    return pca_32
+
+
+def calculate_smd(group1, group2):
+    # Means
+    mean1, mean2 = np.mean(group1), np.mean(group2)
+    # Standard deviations
+    std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
+    # Sample sizes
+    n1, n2 = len(group1), len(group2)
+    # Pooled standard deviation
+    pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2))
+    # Standardized mean difference
+    smd = (mean1 - mean2) / pooled_std
+    return smd, mean1, mean2, pooled_std
+
+
+K_SUB_PROPORTION = 0.01
+M_SUB_PROPORTION = 0.1
+# Number of clusters
+N_CLUSTERS = 9
+# Number of iterations for K means fitting
+N_ITERATIONS = 100
+VERBOSE = True
+
+# Define the start year
+t0 = 2012 # READ THIS IN
+match_years = [t0-10, t0-5, t0]
+
+# Read in the data
+boundary = gpd.read_file('/maps/aew85/projects/1201.geojson')
+
+k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet')
+m_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/1201/matches.parquet')
+
+# concat m and k
+km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))),
+                       m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True)
+
+# Select columns (excluding 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt', and those starting with 'luc')
+exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt']
+exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')]
+
+match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years]
+
+# Extract only the continuous columns
+continuous_columns = km_pixels.columns.difference(exclude_columns)
+km_pixels_selected = km_pixels[continuous_columns]
+
+# PCA transform and conversion to 32 bit ints
+km_pca = to_pca_int32(km_pixels_selected)
+
+# Looks good
+km_pca.head()
+
+#------------------------------------------
+
+# Initialize the KMeans object
+kmeans = faiss.Kmeans(d=km_pca.shape[1], k=N_CLUSTERS, niter=N_ITERATIONS, verbose=True)
+# Perform clustering
+kmeans.train(km_pca)
+
+# Get cluster assignments
+km_pixels['cluster'] = kmeans.index.search(km_pca, 1)[1].flatten()
+
+# Frequency distribution in each cluster
+cluster_counts = pd.Series(km_pixels['cluster']).value_counts()
+if VERBOSE:
+    print("Cluster counts:\n", cluster_counts)
+
+
+# Convert to spatial (simple features)
+km_pixels_sf = gpd.GeoDataFrame(
+    km_pixels,
+    geometry=gpd.points_from_xy(km_pixels['lng'], km_pixels['lat']),
+    crs="EPSG:4326"
+)
+
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Plot cluster centres
+
+# Get cluster centers
+centroids = kmeans.centroids
+
+if VERBOSE:
+    # Plot the cluster centers
+    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=100, marker='x', label='Cluster centers')
+    # Add cluster IDs as labels on the plot
+    for i, center in enumerate(centroids):
+        plt.text(center[0], center[1], str(i), color='red', fontsize=12, weight='bold')
+    
+    plt.title('K-means Clustering with Faiss')
+    plt.xlabel('PCA Component 1')
+    plt.ylabel('PCA Component 2')
+    plt.legend()
+    plt.show()
+    plt.savefig('Figures/cluster_centres_faiss_1.png')
+    plt.close()  # Close the plot to free up memory
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Plot clusters as separate facets
+if VERBOSE:
+    fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 15))
+    axes = axes.flatten()
+    
+    clusters = sorted(km_pixels_sf['cluster'].unique())
+    
+    for i, cluster in enumerate(clusters):
+        ax = axes[i]
+        cluster_data = km_pixels_sf[km_pixels_sf['cluster'] == cluster]
+        cluster_data.plot(ax=ax, color='blue', markersize=0.2)
+        boundary.plot(ax=ax, edgecolor='black', facecolor='none')
+        ax.set_title(f'Cluster {cluster}')
+        ax.set_axis_off()
+    
+    # Turn off any unused subplots
+    for j in range(len(clusters), len(axes)):
+        fig.delaxes(axes[j])
+    
+    plt.tight_layout()
+    plt.savefig('Figures/cluster_faiss_1_facet.png')
+    plt.close()
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+# Extract K and M pixels 
+k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt']
+m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl']
+
+# Extract K and M PCA transforms
+k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy()
+m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy()
+
+k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION)
+m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION)
+
+# Define indexs for the samples from K and M
+k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False)
+m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False)
+
+# Take random samples from K and M pixels
+k_sub = k_pixels.iloc[k_random_indices]
+m_sub = m_pixels.iloc[m_random_indices]
+
+# Take corresponding random samples from the PCA transformed K and M 
+k_sub_pca = k_pca[k_random_indices,:]
+m_sub_pca = m_pca[m_random_indices,:]
+
+if VERBOSE:
+    # Handy code for displaying the number of counts in each unique category combination
+    # In K
+    k_combination_counts = k_sub.groupby(match_cats).size().reset_index(name='counts')
+    print("k_combination_counts")
+    print(k_combination_counts)
+    # In M
+    m_combination_counts = m_sub.groupby(match_cats).size().reset_index(name='counts')
+    print("m_combination_counts")
+    print(m_combination_counts)
+
+
+# Identify the unique combinations of luc columns
+k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats))
+
+pairs_list = []
+
+start_time = time.time()
+for i in range(0, k_cat_combinations.shape[0]):
+    # i = 6 # ith element of the unique combinations of the luc time series in k
+    # for in range()
+    k_cat_comb = k_cat_combinations.iloc[i]
+    k_cat = k_sub[(k_sub[match_cats] == k_cat_comb).all(axis=1)]
+    k_cat_pca = k_sub_pca[(k_sub[match_cats] == k_cat_comb).all(axis=1)]
+    
+    # Find the subset in km_pixels that matches this combination
+    m_cat = m_sub[(m_sub[match_cats] == k_cat_comb).all(axis=1)]
+    m_cat_pca = m_sub_pca[(m_sub[match_cats] == k_cat_comb).all(axis=1)]
+    
+    if VERBOSE:
+        print('ksub_cat:' + str(k_cat.shape[0]))
+        print('msub_cat:' + str(m_cat.shape[0]))
+    
+    # If there is no suitable match for the pre-project luc time series
+    # Then it may be preferable to just take the luc state at t0
+    # m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == K_luc_comb[1:3]).all(axis=1)]
+    # m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)]
+    # For if there are no matches return nothing
+    
+    if(m_cat.shape[0] < k_cat.shape[0] * 5):
+        print("M insufficient for matching. Set VERBOSE to True for more details.")
+        continue
+    
+    matches_index = loop_match(m_cat_pca, k_cat_pca)
+    m_cat_matches = m_cat.iloc[matches_index]
+    
+    # i = 0
+    # matched = pd.concat([k_cat.iloc[i], m_cat.iloc[matches[i]]], axis=1, ignore_index=True)
+    # matched.columns = ['trt', 'ctrl']
+    # matched
+    #Looks great!
+    columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope']
+    # Calculate SMDs for the specified columns
+    smd_results = []
+    for column in columns_to_compare:
+        smd, mean1, mean2, pooled_std = calculate_smd(k_cat[column], m_cat_matches[column])
+        smd_results.append((column, smd, mean1, mean2, pooled_std))
+    
+    # Convert the results to a DataFrame for better readability
+    smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std'])
+    
+    if VERBOSE:
+        # Print the results
+        print("categorical combination:")
+        print(k_cat_comb)
+        # Count how many items in 'column1' are not equal to the specified integer value
+        print("LUC flips in K:")
+        (k_cat['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum()
+        print("LUC flips in matches:")
+        (m_cat_matches['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum()
+        print("Standardized Mean Differences:")
+        print(smd_df)
+    
+    # Join the pairs into one dataframe:
+    k_cat = k_cat.reset_index(drop = True)
+    m_cat_matches = m_cat_matches.reset_index(drop = True)  
+    pairs_df = pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1)
+    
+    # Append the resulting DataFrame to the list
+    pairs_list.append(pairs_df)
+
+# Combine all the DataFrames in the list into a single DataFrame
+combined_pairs = pd.concat(pairs_list, ignore_index=True)
+
+end_time = time.time()
+elapsed_time = end_time - start_time
+if VERBOSE:
+    print(f"Elapsed time: {elapsed_time:.2f} seconds")
+
+columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope']
+# Calculate SMDs for the specified columns
+smd_results = []
+for column in columns_to_compare:
+    smd, mean1, mean2, pooled_std = calculate_smd(combined_pairs['k_' + column], combined_pairs['s_' + column])
+    smd_results.append((column, smd, mean1, mean2, pooled_std))
+
+# Convert the results to a DataFrame for better readability
+smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std'])
+print(smd_df)
+
+smd_results = []
+for column in columns_to_compare:
+    smd, mean1, mean2, pooled_std = calculate_smd(k_pixels[column], m_pixels[column])
+    smd_results.append((column, smd, mean1, mean2, pooled_std))
+
+# Convert the results to a DataFrame for better readability
+smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std'])
+print(smd_df)
+
+
+
+
+
+
+
+def find_match_iteration(
+    k_parquet_filename: str,
+    m_parquet_filename: str,
+    start_year: int,
+    output_folder: str,
+    idx_and_seed: tuple[int, int]
+) -> None:
+    logging.info("Find match iteration %d of %d", idx_and_seed[0] + 1, REPEAT_MATCH_FINDING)
+    rng = np.random.default_rng(idx_and_seed[1])
+
+    logging.info("Loading K from %s", k_parquet_filename)
+    k_pixels = pd.read_parquet(k_parquet_filename)
+    
+    logging.info("Loading M from %s", m_parquet_filename)
+    m_pixels = pd.read_parquet(m_parquet_filename)
+    
+    # concat m and k
+    km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))),
+                           m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True)
+    
+    # Find the continuous columns
+    exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt']
+    exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')]
+    continuous_columns = km_pixels.columns.difference(exclude_columns)
+    # Categorical columns
+    match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years]
+    
+    logging.info("Starting PCA transformation of k and m union. km_pixels.shape: %a", {km_pixels.shape})
+    # PCA transform and conversion to 32 bit ints for continuous only
+    km_pca = to_pca_int32(km_pixels[continuous_columns])
+    logging.info("Done PCA transformation")
+    
+    # Extract K and M pixels  - this might be unnecessary I just wanted to make sure 
+    # K and M were in the same order here and in the PCA transform
+    k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt']
+    m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl']
+    # Extract K and M PCA transforms
+    k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy()
+    m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy()
+    
+    # Sample from K and M
+    k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION)
+    m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION)
+    # Define indexs for the samples from K and M
+    k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False)
+    m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False)
+    # Take random samples from K and M pixels
+    k_sub = k_pixels.iloc[k_random_indices]
+    m_sub = m_pixels.iloc[m_random_indices]
+    # Take corresponding random samples from the PCA transformed K and M 
+    k_sub_pca = k_pca[k_random_indices,:]
+    m_sub_pca = m_pca[m_random_indices,:]
+    
+    logging.info("Samples taken from K and M. k_sub.shape: %a; m_sub.shape: %a", {k_sub.shape, m_sub.shape})
+    
+    # Identify the unique combinations of luc columns
+    k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats))
+    
+    pairs_list = []
+    matchless_list = []
+    
+    logging.info("Starting greedy matching... k_sub.shape: %s, m_sub.shape: %s",
+                                  k_sub.shape, m_sub.shape)
+    
+    start_time = time.time()
+    for i in range(0, k_cat_combinations.shape[0]):
+        # i = 6 # ith element of the unique combinations of the luc time series in k
+        # for in range()
+        k_cat_comb = k_cat_combinations.iloc[i]
+        k_cat = k_sub[(k_sub[match_cats] == k_cat_comb).all(axis=1)]
+        k_cat_pca = k_sub_pca[(k_sub[match_cats] == k_cat_comb).all(axis=1)]
+        
+        # Find the subset in km_pixels that matches this combination
+        m_cat = m_sub[(m_sub[match_cats] == k_cat_comb).all(axis=1)]
+        m_cat_pca = m_sub_pca[(m_sub[match_cats] == k_cat_comb).all(axis=1)]
+        
+        if VERBOSE:
+            print('ksub_cat:' + str(k_cat.shape[0]))
+            print('msub_cat:' + str(m_cat.shape[0]))
+        
+        # If there is no suitable match for the pre-project luc time series
+        # Then it may be preferable to just take the luc state at t0
+        # m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == K_luc_comb[1:3]).all(axis=1)]
+        # m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)]
+        # For if there are no matches return nothing
+        
+        if(m_cat.shape[0] < k_cat.shape[0] * 5):
+            # print("M insufficient for matching. Set VERBOSE to True for more details.")
+            # Append the matchless DataFrame to the list
+            matchless_list.append(k_cat)
+            continue
+        
+        # Find the matches
+        matches_index = loop_match(m_cat_pca, k_cat_pca)
+        m_cat_matches = m_cat.iloc[matches_index]
+        
+        # i = 0
+        # matched = pd.concat([k_cat.iloc[i], m_cat.iloc[matches[i]]], axis=1, ignore_index=True)
+        # matched.columns = ['trt', 'ctrl']
+        # matched
+        #Looks great!
+        columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope']
+        # Calculate SMDs for the specified columns
+        smd_results = []
+        for column in columns_to_compare:
+            smd, mean1, mean2, pooled_std = calculate_smd(k_cat[column], m_cat_matches[column])
+            smd_results.append((column, smd, mean1, mean2, pooled_std))
+        
+        # Convert the results to a DataFrame for better readability
+        smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std'])
+        
+        if VERBOSE:
+            # Print the results
+            print("categorical combination:")
+            print(k_cat_comb)
+            # Count how many items in 'column1' are not equal to the specified integer value
+            print("LUC flips in K:")
+            (k_cat['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum()
+            print("LUC flips in matches:")
+            (m_cat_matches['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum()
+            print("Standardized Mean Differences:")
+            print(smd_df)
+        
+        # Join the pairs into one dataframe:
+        k_cat = k_cat.reset_index(drop = True)
+        m_cat_matches = m_cat_matches.reset_index(drop = True)  
+        pairs_df = pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1)
+        
+        # Append the resulting DataFrame to the list
+        pairs_list.append(pairs_df)
+
+    # Combine all the DataFrames in the list into a single DataFrame
+    pairs = pd.concat(pairs_list, ignore_index=True)
+    matchless = pd.concat(matchless_list, ignore_index=True)
+    
+    logging.info("Finished greedy matching... pairs.shape: %s, matchless.shape: %s",
+                                  pairs.shape, matchless.shape)
+    
+    logging.info("Starting storing matches...")
+    pairs.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet'))
+    matchless.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet'))
+    
+    logging.info("Finished find match iteration")
+
+
+def find_pairs(
+    k_parquet_filename: str,
+    m_parquet_filename: str,
+    start_year: int,
+    seed: int,
+    output_folder: str,
+    processes_count: int
+) -> None:
+    logging.info("Starting find pairs")
+    os.makedirs(output_folder, exist_ok=True)
+
+    rng = np.random.default_rng(seed)
+    iteration_seeds = zip(range(REPEAT_MATCH_FINDING), rng.integers(0, 1000000, REPEAT_MATCH_FINDING))
+
+    with Pool(processes=processes_count) as pool:
+        pool.map(
+            partial(
+                find_match_iteration,
+                k_parquet_filename,
+                m_parquet_filename,
+                start_year,
+                output_folder
+            ),
+            iteration_seeds
+        )
+
+
+def main():
+    # If you use the default multiprocess model then you risk deadlocks when logging (which we
+    # have hit). Spawn is the default on macOS, but not on Linux.
+    set_start_method("spawn")
+
+    parser = argparse.ArgumentParser(description="Takes K and M and finds 100 sets of matches.")
+    parser.add_argument(
+        "--k",
+        type=str,
+        required=True,
+        dest="k_filename",
+        help="Parquet file containing pixels from K as generated by calculate_k.py"
+    )
+    parser.add_argument(
+        "--m",
+        type=str,
+        required=True,
+        dest="m_filename",
+        help="Parquet file containing pixels from M as generated by build_m_table.py"
+    )
+    parser.add_argument(
+        "--start_year",
+        type=int,
+        required=True,
+        dest="start_year",
+        help="Year project started."
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        required=True,
+        dest="seed",
+        help="Random number seed, to ensure experiments are repeatable."
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        required=True,
+        dest="output_directory_path",
+        help="Directory into which output matches will be written. Will be created if it does not exist."
+    )
+    parser.add_argument(
+        "-j",
+        type=int,
+        required=False,
+        default=round(cpu_count() / 2),
+        dest="processes_count",
+        help="Number of concurrent threads to use."
+    )
+    args = parser.parse_args()
+
+    find_pairs(
+        args.k_filename,
+        args.m_filename,
+        args.start_year,
+        args.seed,
+        args.output_directory_path,
+        args.processes_count
+    )
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 088048eb0907685ec24ba6afc20cffe7ee71e2c7 Mon Sep 17 00:00:00 2001
From: swinersha <twswinfield@gmai.com>
Date: Wed, 31 Jul 2024 18:54:55 +0000
Subject: [PATCH 2/6] feat: fast find pairs experiment incorporated into
 executable

---
 methods/matching/find_pairs.py | 469 +++++++++++++++------------------
 1 file changed, 209 insertions(+), 260 deletions(-)

diff --git a/methods/matching/find_pairs.py b/methods/matching/find_pairs.py
index 7f17782..62e9e9c 100644
--- a/methods/matching/find_pairs.py
+++ b/methods/matching/find_pairs.py
@@ -3,16 +3,35 @@
 import logging
 from functools import partial
 from multiprocessing import Pool, cpu_count, set_start_method
-from numba import jit  # type: ignore
+from numba import njit  # type: ignore
 import numpy as np
 import pandas as pd
+from sklearn.decomposition import PCA
+import faiss
 
 from methods.common.luc import luc_matching_columns
 
+# TO DO: 
+# 1. Rename columns to luc10, luc5 and luc0 to align with the pipeline
+
+# to delete: 
+# start_year = 2012
+# k_parquet_filename = '/maps/tws36/tmf_pipe_out/1201/k_all.parquet'
+# m_parquet_filename = '/maps/aew85/tmf_pipe_out/1201/tom_pairs/matches.parquet'
+# luc_match = True
+
 REPEAT_MATCH_FINDING = 100
-DEFAULT_DISTANCE = 10000000.0
 DEBUG = False
 
+K_SUB_PROPORTION = 0.01
+M_SUB_PROPORTION = 0.1
+# Number of clusters
+NUM_CLUSTERS = 9
+# Number of iterations for K means fitting
+NUM_ITERATIONS = 100
+RELATIVE_MATCH_YEARS = [-10, -5, 0]
+
+
 DISTANCE_COLUMNS = [
     "elevation", "slope", "access",
     "cpc0_u", "cpc0_d",
@@ -23,297 +42,219 @@
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 
+# this function uses loops instead of numpy vector operations
+@njit
+def loop_match(m_pca, k_pca):
+    picked = np.ones((m_pca.shape[0],), dtype=np.bool_)
+    fast_matches = np.full(k_pca.shape[0], -1, dtype=np.int32)
+    for i in range(0, k_pca.shape[0]):
+        min_squared_diff_sum = np.inf
+        min_squared_diff_j = -1
+        for j in range(0, m_pca.shape[0]):
+            if picked[j]:
+                squared_diff_sum = np.sum((m_pca[j, :] - k_pca[i, :])**2)
+                if squared_diff_sum < min_squared_diff_sum:
+                    min_squared_diff_sum = squared_diff_sum
+                    min_squared_diff_j = j
+        fast_matches[i] = min_squared_diff_j
+        picked[min_squared_diff_j] = False
+    return fast_matches
+
+### Now try with real numbers
+
+def to_int32(x):
+    # Normalize the data to the range 0 to 1
+    min_val = np.min(x)
+    max_val = np.max(x)
+    normalized_data = (x - min_val) / (max_val - min_val)
+    # Scale the normalized data to the range 0 to 255 for unsigned 8-bit integers
+    scaled_data = normalized_data * 255
+    # Convert to 32-bit integers (0 to 255)
+    int32_data = scaled_data.astype(np.int32)
+    return int32_data
+
+def to_pca_int32(x):
+    # Perform PCA and convert to dataframe
+    pca = PCA(n_components=min(len(x), len(x.columns)), 
+            whiten=False)  # Centering and scaling done by default
+    pca_result = pca.fit_transform(x)
+    pca_df = pd.DataFrame(pca_result)
+    # Convert all columns to int8
+    pca_32 = pca_df.apply(to_int32)
+    return pca_32
+
+
+def calculate_smd(group1, group2):
+    # Means
+    mean1, mean2 = np.mean(group1), np.mean(group2)
+    # Standard deviations
+    std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
+    # Sample sizes
+    n1, n2 = len(group1), len(group2)
+    # Pooled standard deviation
+    pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2))
+    # Standardized mean difference
+    smd = (mean1 - mean2) / pooled_std
+    return smd, mean1, mean2, pooled_std
+
 def find_match_iteration(
-    k_parquet_filename: str,
-    m_parquet_filename: str,
+    km_pixels: pd.DataFrame,
+    km_pca: np.ndarray,
     start_year: int,
+    luc_match: bool,
     output_folder: str,
     idx_and_seed: tuple[int, int]
 ) -> None:
     logging.info("Find match iteration %d of %d", idx_and_seed[0] + 1, REPEAT_MATCH_FINDING)
     rng = np.random.default_rng(idx_and_seed[1])
-
-    logging.info("Loading K from %s", k_parquet_filename)
-
-    # Methodology 6.5.7: For a 10% sample of K
-    k_set = pd.read_parquet(k_parquet_filename)
-    k_subset = k_set.sample(
-        frac=0.1,
-        random_state=rng
-    ).reset_index()
-
-    logging.info("Loading M from %s", m_parquet_filename)
-    m_set = pd.read_parquet(m_parquet_filename)
-
-    # get the column ids for DISTANCE_COLUMNS
-    thresholds_for_columns = np.array([
-            200.0, # Elev
-            2.5, # Slope
-            10.0,  # Access
-            0.1, # CPCs
-            0.1, # CPCs
-            0.1, # CPCs
-            0.1, # CPCs
-            0.1, # CPCs
-            0.1, # CPCs
-    ])
-
-    logging.info("Preparing s_set...")
-
-    m_dist_thresholded_df = m_set[DISTANCE_COLUMNS] / thresholds_for_columns
-    k_subset_dist_thresholded_df = k_subset[DISTANCE_COLUMNS] / thresholds_for_columns
-
-    # convert to float32 numpy arrays and make them contiguous for numba to vectorise
-    m_dist_thresholded = np.ascontiguousarray(m_dist_thresholded_df, dtype=np.float32)
-    k_subset_dist_thresholded = np.ascontiguousarray(k_subset_dist_thresholded_df, dtype=np.float32)
-
-    # LUC columns are all named with the year in, so calculate the column names
-    # for the years we are intested in
-    luc0, luc5, luc10 = luc_matching_columns(start_year)
-    # As well as all the LUC columns for later use
-    luc_columns = [x for x in m_set.columns if x.startswith('luc')]
-
-    hard_match_columns = ['country', 'ecoregion', luc10, luc5, luc0]
-    assert len(hard_match_columns) == HARD_COLUMN_COUNT
-
-    # similar to the above, make the hard match columns contiguous float32 numpy arrays
-    m_dist_hard = np.ascontiguousarray(m_set[hard_match_columns].to_numpy()).astype(np.int32)
-    k_subset_dist_hard = np.ascontiguousarray(k_subset[hard_match_columns].to_numpy()).astype(np.int32)
-
-    # Methodology 6.5.5: S should be 10 times the size of K, in order to achieve this for every
-    # pixel in the subsample (which is 10% the size of K) we select 100 pixels.
-    required = 100
-
-    logging.info("Running make_s_set_mask... required: %d", required)
-    starting_positions = rng.integers(0, int(m_dist_thresholded.shape[0]), int(k_subset_dist_thresholded.shape[0]))
-    s_set_mask_true, no_potentials = make_s_set_mask(
-        m_dist_thresholded,
-        k_subset_dist_thresholded,
-        m_dist_hard,
-        k_subset_dist_hard,
-        starting_positions,
-        required
+    
+    match_years = [start_year + year for year in RELATIVE_MATCH_YEARS]
+    # The categorical columns:
+    if luc_match:
+        match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years]
+    else:
+        match_cats = ["ecoregion", "country", "cluster"]
+    
+    # Extract K and M pixels 
+    k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt']
+    m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl']
+    # Extract K and M PCA transforms
+    k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy()
+    m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy()
+    # Draw subsamples
+    # Methodology 6.5.7: Needs to be updated !!!
+    k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION)
+    m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION)
+    # Define indexs for the samples from K and M
+    k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False)
+    m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False)
+    # Take random samples from K and M pixels
+    k_sub = k_pixels.iloc[k_random_indices]
+    m_sub = m_pixels.iloc[m_random_indices]
+    # Take corresponding random samples from the PCA transformed K and M 
+    k_sub_pca = k_pca[k_random_indices,:]
+    m_sub_pca = m_pca[m_random_indices,:]
+    
+    logging.info("Starting greedy matching... k_sub.shape: %s, m_sub.shape: %s",
+                 k_sub.shape, m_sub.shape)
+    
+    pairs, matchless = greedy_match(
+        k_sub,
+        m_sub,
+        k_sub_pca,
+        m_sub_pca,
+        match_cats
     )
-
-    logging.info("Done make_s_set_mask. s_set_mask.shape: %a", {s_set_mask_true.shape})
-
-    s_set = m_set[s_set_mask_true]
-    potentials = np.invert(no_potentials)
-
-    k_subset = k_subset[potentials]
-    logging.info("Finished preparing s_set. shape: %a", {s_set.shape})
-
-    # Notes:
-    # 1. Not all pixels may have matches
-    results = []
-    matchless = []
-
-    s_set_for_cov = s_set[DISTANCE_COLUMNS]
-    logging.info("Calculating covariance...")
-    covarience = np.cov(s_set_for_cov, rowvar=False)
-    logging.info("Calculating inverse covariance...")
-    invconv = np.linalg.inv(covarience).astype(np.float32)
-
-    # Match columns are luc10, luc5, luc0, "country" and "ecoregion"
-    s_set_match = s_set[hard_match_columns + DISTANCE_COLUMNS].to_numpy(dtype=np.float32)
-    # this is required so numba can vectorise the loop in greedy_match
-    s_set_match = np.ascontiguousarray(s_set_match)
-
-    # Now we do the same thing for k_subset
-    k_subset_match = k_subset[hard_match_columns + DISTANCE_COLUMNS].to_numpy(dtype=np.float32)
-    # this is required so numba can vectorise the loop in greedy_match
-    k_subset_match = np.ascontiguousarray(k_subset_match)
-
-    logging.info("Starting greedy matching... k_subset_match.shape: %s, s_set_match.shape: %s",
-                 k_subset_match.shape, s_set_match.shape)
-
-    add_results, k_idx_matchless = greedy_match(
-        k_subset_match,
-        s_set_match,
-        invconv
-    )
-
+    
+    # Combine all the pairs DataFrames in the list into a single DataFrame
+    combined_pairs = pd.concat(pairs, ignore_index=True)
+    # Combine all the matchess DataFrames in the list into a single DataFrame
+    combined_matchless = pd.concat(matchless, ignore_index=True)
     logging.info("Finished greedy matching...")
-
+    
     logging.info("Starting storing matches...")
-
-    for result in add_results:
-        (k_idx, s_idx) = result
-        k_row = k_subset.iloc[k_idx]
-        match = s_set.iloc[s_idx]
-
-        if DEBUG:
-            for hard_match_column in hard_match_columns:
-                if k_row[hard_match_column] != match[hard_match_column]:
-                    print(k_row)
-                    print(match)
-                    raise ValueError("Hard match inconsistency")
-
-        results.append(
-            [k_row.lat, k_row.lng] + [k_row[x] for x in luc_columns + DISTANCE_COLUMNS] + \
-            [match.lat, match.lng] + [match[x] for x in luc_columns + DISTANCE_COLUMNS]
-        )
-
-    logging.info("Finished storing matches...")
-
-    for k_idx in k_idx_matchless:
-        k_row = k_subset.iloc[k_idx]
-        matchless.append(k_row)
-
-    columns = ['k_lat', 'k_lng'] + \
-        [f'k_{x}' for x in luc_columns + DISTANCE_COLUMNS] + \
-        ['s_lat', 's_lng'] + \
-        [f's_{x}' for x in luc_columns + DISTANCE_COLUMNS]
-
-    results_df = pd.DataFrame(results, columns=columns)
-    results_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet'))
-
-    matchless_df = pd.DataFrame(matchless, columns=k_set.columns)
-    matchless_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet'))
-
+    combined_pairs_df = pd.DataFrame(combined_pairs)
+    combined_pairs_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet'))
+    
+    combined_matchless_df = pd.DataFrame(combined_matchless)
+    combined_matchless_df.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet'))
+    
     logging.info("Finished find match iteration")
 
-@jit(nopython=True, fastmath=True, error_model="numpy")
-def make_s_set_mask(
-    m_dist_thresholded: np.ndarray,
-    k_subset_dist_thresholded: np.ndarray,
-    m_dist_hard: np.ndarray,
-    k_subset_dist_hard: np.ndarray,
-    starting_positions: np.ndarray,
-    required: int
-):
-    m_size = m_dist_thresholded.shape[0]
-    k_size = k_subset_dist_thresholded.shape[0]
-
-    s_include = np.zeros(m_size, dtype=np.bool_)
-    k_miss = np.zeros(k_size, dtype=np.bool_)
-
-    for k in range(k_size):
-        matches = 0
-        k_row = k_subset_dist_thresholded[k, :]
-        k_hard = k_subset_dist_hard[k]
-
-        for index in range(m_size):
-            m_index = (index + starting_positions[k]) % m_size
-
-            m_row = m_dist_thresholded[m_index, :]
-            m_hard = m_dist_hard[m_index]
-
-            should_include = True
-
-            # check that every element of m_hard matches k_hard
-            hard_equals = True
-            for j in range(m_hard.shape[0]):
-                if m_hard[j] != k_hard[j]:
-                    hard_equals = False
-
-            if not hard_equals:
-                should_include = False
-            else:
-                for j in range(m_row.shape[0]):
-                    if abs(m_row[j] - k_row[j]) > 1.0:
-                        should_include = False
-
-            if should_include:
-                s_include[m_index] = True
-                matches += 1
-
-            # Don't find any more M's
-            if matches == required:
-                break
-
-        k_miss[k] = matches == 0
-
-    return s_include, k_miss
-
-# Function which returns a boolean array indicating whether all values in a row are true
-@jit(nopython=True, fastmath=True, error_model="numpy")
-def rows_all_true(rows: np.ndarray):
-    # Don't use np.all because not supported by numba
-
-    # Create an array of booleans for rows in s still available
-    all_true = np.ones((rows.shape[0],), dtype=np.bool_)
-    for row_idx in range(rows.shape[0]):
-        for col_idx in range(rows.shape[1]):
-            if not rows[row_idx, col_idx]:
-                all_true[row_idx] = False
-                break
-
-    return all_true
-
-
-@jit(nopython=True, fastmath=True, error_model="numpy")
 def greedy_match(
-    k_subset: np.ndarray,
-    s_set: np.ndarray,
-    invcov: np.ndarray
+    k_sub: pd.DataFrame,
+    m_sub: pd.DataFrame,
+    k_sub_pca: np.ndarray,
+    m_sub_pca: np.ndarray,
+    match_cats: list
 ):
-    # Create an array of booleans for rows in s still available
-    s_available = np.ones((s_set.shape[0],), dtype=np.bool_)
-    total_available = s_set.shape[0]
-
-    results = []
+    # Identify the unique combinations of categorical columns
+    k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats))
+    
+    # Not all pixels may have matches
+    pairs = []
     matchless = []
-
-    s_tmp = np.zeros((s_set.shape[0],), dtype=np.float32)
-
-    for k_idx in range(k_subset.shape[0]):
-        k_row = k_subset[k_idx, :]
-
-        hard_matches = rows_all_true(s_set[:, :HARD_COLUMN_COUNT] == k_row[:HARD_COLUMN_COUNT]) & s_available
-        hard_matches = hard_matches.reshape(
-            -1,
-        )
-
-        if total_available > 0:
-            # Now calculate the distance between the k_row and all the hard matches we haven't already matched
-            s_tmp[hard_matches] = batch_mahalanobis_squared(
-                s_set[hard_matches, HARD_COLUMN_COUNT:], k_row[HARD_COLUMN_COUNT:], invcov
-            )
-            # Find the index of the minimum distance in s_tmp[hard_matches] but map it back to the index in s_set
-            if np.any(hard_matches):
-                min_dist_idx = np.argmin(s_tmp[hard_matches])
-                min_dist_idx = np.arange(s_tmp.shape[0])[hard_matches][min_dist_idx]
-
-                results.append((k_idx, min_dist_idx))
-                s_available[min_dist_idx] = False
-                total_available -= 1
-        else:
-            matchless.append(k_idx)
-
-    return (results, matchless)
-
-# optimised batch implementation of mahalanobis distance that returns a distance per row
-@jit(nopython=True, fastmath=True, error_model="numpy")
-def batch_mahalanobis_squared(rows, vector, invcov):
-    # calculate the difference between the vector and each row (broadcasted)
-    diff = rows - vector
-    # calculate the distance for each row in one batch
-    dists = (np.dot(diff, invcov) * diff).sum(axis=1)
-    return dists
-
+    
+    for i in range(0, k_cat_combinations.shape[0]):
+        # i = 6 # ith element of the unique combinations of the luc time series in k
+        # for in range()
+        k_cat_comb = k_cat_combinations.iloc[i]
+        k_cat_index = k_sub[match_cats] == k_cat_comb
+        k_cat = k_sub[(k_cat_index).all(axis=1)]
+        k_cat_pca = k_sub_pca[(k_cat_index).all(axis=1)]
+        
+        # Find the subset in km_pixels that matches this combination
+        m_cat_index = m_sub[match_cats] == k_cat_comb
+        m_cat = m_sub[(m_cat_index).all(axis=1)]
+        m_cat_pca = m_sub_pca[(m_cat_index).all(axis=1)]
+        
+        # If there is no suitable match for the pre-project luc time series
+        # Then it may be preferable to just take the luc state at t0
+        # m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == K_luc_comb[1:3]).all(axis=1)]
+        # m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)]
+        # For now if there are no matches return nothing
+        
+        if(m_cat.shape[0] < k_cat.shape[0] * 5):
+            matchless.append(k_cat)
+            continue
+        
+        matches_index = loop_match(m_cat_pca, k_cat_pca)
+        m_cat_matches = m_cat.iloc[matches_index]
+        
+        # Join the pairs into one dataframe:
+        k_cat = k_cat.reset_index(drop = True)
+        m_cat_matches = m_cat_matches.reset_index(drop = True)  
+        pairs_df = pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1)
+        # Append the resulting DataFrame to the list
+        pairs.append(pairs_df)
+    
+    return (pairs, matchless)
 
 def find_pairs(
     k_parquet_filename: str,
     m_parquet_filename: str,
     start_year: int,
+    luc_match: bool,
     seed: int,
     output_folder: str,
     processes_count: int
 ) -> None:
+    logging.info("Loading K from %s", k_parquet_filename)
+    k_pixels = pd.read_parquet(k_parquet_filename)
+    logging.info("Loading M from %s", m_parquet_filename)
+    m_pixels = pd.read_parquet(m_parquet_filename)
+    # concat m and k
+    km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))),
+                        m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True)
+    
+    # Extract only the continuous columns
+    km_pixels_distance = km_pixels[DISTANCE_COLUMNS]
+    # PCA transform and conversion to 32 bit ints
+    logging.info("Transforming continuous variables to PCA space")
+    km_pca = to_pca_int32(km_pixels_distance)
+    # Find clusters using Kmeans
+    logging.info("Starting cluster assignment using kmeans")
+    # Initialize the KMeans object
+    kmeans = faiss.Kmeans(d=km_pca.shape[1], k=NUM_CLUSTERS, niter=NUM_ITERATIONS, verbose=True)
+    # Perform clustering
+    kmeans.train(km_pca)
+    # Get cluster assignments
+    km_pixels['cluster'] = kmeans.index.search(km_pca, 1)[1].flatten()
+    
     logging.info("Starting find pairs")
     os.makedirs(output_folder, exist_ok=True)
-
+    
     rng = np.random.default_rng(seed)
     iteration_seeds = zip(range(REPEAT_MATCH_FINDING), rng.integers(0, 1000000, REPEAT_MATCH_FINDING))
-
+    
     with Pool(processes=processes_count) as pool:
         pool.map(
             partial(
                 find_match_iteration,
-                k_parquet_filename,
-                m_parquet_filename,
+                km_pixels,
+                km_pca,
                 start_year,
+                luc_match,
                 output_folder
             ),
             iteration_seeds
@@ -346,6 +287,13 @@ def main():
         dest="start_year",
         help="Year project started."
     )
+    parser.add_argument(
+        "--luc_match",
+        type=bool,
+        required=True,
+        dest="luc_match",
+        help="Boolean determines whether matching should include LUCs."
+    )
     parser.add_argument(
         "--seed",
         type=int,
@@ -374,6 +322,7 @@ def main():
         args.k_filename,
         args.m_filename,
         args.start_year,
+        args.luc_match,
         args.seed,
         args.output_directory_path,
         args.processes_count

From 4a7abcec0f6af5838065e0650142db6e537929c8 Mon Sep 17 00:00:00 2001
From: swinersha <twswinfield@gmai.com>
Date: Wed, 7 Aug 2024 13:34:41 +0000
Subject: [PATCH 3/6] fix: sppeds up find pairs by sampling M to the size of K

---
 .../cluster_find_pairs_interactive.py         | 303 +++---------------
 methods/matching/find_pairs.py                |  54 ++--
 2 files changed, 80 insertions(+), 277 deletions(-)

diff --git a/methods/matching/cluster_find_pairs_interactive.py b/methods/matching/cluster_find_pairs_interactive.py
index 9ac25fb..215722b 100644
--- a/methods/matching/cluster_find_pairs_interactive.py
+++ b/methods/matching/cluster_find_pairs_interactive.py
@@ -7,6 +7,7 @@
 from sklearn.cluster import KMeans
 import matplotlib.pyplot as plt
 import geopandas as gpd
+from pyproj import Proj, transform
 import os
 import time
 import sys
@@ -108,14 +109,53 @@ def calculate_smd(group1, group2):
 
 # Define the start year
 t0 = 2012 # READ THIS IN
-match_years = [t0-10, t0-5, t0]
 
 # Read in the data
 boundary = gpd.read_file('/maps/aew85/projects/1201.geojson')
 
-k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet')
+k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k.parquet')
+# k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet')
 m_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/1201/matches.parquet')
 
+
+t0 = 2018
+boundary = gpd.read_file('/maps/aew85/projects/ona.geojson')
+k_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/fastfp_test_ona/k.parquet')
+m_pixels = pd.read_parquet('/maps/aew85/tmf_pipe_out/fastfp_test_ona/matches.parquet')
+
+if(m_pixels.shape[0] > (k_pixels.shape[0])):
+    m_sub_size = int(k_pixels.shape[0]) # First down sample M as it is ~230 million points    
+    m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False)
+    m_pixels = m_pixels.iloc[m_random_indices]
+
+# # Calculate the central coordinates (centroid)
+# central_lat = m_pixels['lat'].mean()
+# central_lon = m_pixels['lng'].mean()
+# aeqd_proj = f"+proj=aeqd +lat_0={central_lat} +lon_0={central_lon} +datum=WGS84"
+
+# # Convert the DataFrame to a GeoDataFrame
+# m_gdf = gpd.GeoDataFrame(m_pixels, geometry=gpd.points_from_xy(m_pixels.lng, m_pixels.lat))
+# # Set the original CRS to WGS84 (EPSG:4326)
+# m_gdf.set_crs(epsg=4326, inplace=True)
+
+# # Transform the GeoDataFrame to the AEQD projection
+# m_gdf_aeqd = m_gdf.to_crs(aeqd_proj)
+
+# # Extract the transformed coordinates
+# gdf_aeqd['aeqd_x'] = gdf_aeqd.geometry.x
+# gdf_aeqd['aeqd_y'] = gdf_aeqd.geometry.y
+
+# # Define the grid resolution in meters
+# grid_resolution_m = 5000  # 5 km
+
+# # Calculate grid cell indices
+# gdf_aeqd['grid_x'] = (gdf_aeqd['aeqd_x'] // grid_resolution_m).astype(int)
+# gdf_aeqd['grid_y'] = (gdf_aeqd['aeqd_y'] // grid_resolution_m).astype(int)
+
+# # Print the first few rows to verify
+# print(gdf_aeqd.head())
+
+
 # concat m and k
 km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))),
                        m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True)
@@ -124,8 +164,6 @@ def calculate_smd(group1, group2):
 exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt']
 exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')]
 
-match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years]
-
 # Extract only the continuous columns
 continuous_columns = km_pixels.columns.difference(exclude_columns)
 km_pixels_selected = km_pixels[continuous_columns]
@@ -178,7 +216,7 @@ def calculate_smd(group1, group2):
     plt.ylabel('PCA Component 2')
     plt.legend()
     plt.show()
-    plt.savefig('Figures/cluster_centres_faiss_1.png')
+    plt.savefig('Figures/ona_cluster_centres_faiss_1.png')
     plt.close()  # Close the plot to free up memory
 
 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -202,11 +240,14 @@ def calculate_smd(group1, group2):
         fig.delaxes(axes[j])
     
     plt.tight_layout()
-    plt.savefig('Figures/cluster_faiss_1_facet.png')
+    plt.savefig('Figures/Ona_cluster_faiss_1_facet.png')
     plt.close()
 
 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+match_years = [t0-10, t0-5, t0]
+match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years]
+
 # Extract K and M pixels 
 k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt']
 m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl']
@@ -216,7 +257,7 @@ def calculate_smd(group1, group2):
 m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy()
 
 k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION)
-m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION)
+m_sub_size = int(m_pixels.shape[0] * 1)
 
 # Define indexs for the samples from K and M
 k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False)
@@ -330,252 +371,4 @@ def calculate_smd(group1, group2):
 smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std'])
 print(smd_df)
 
-smd_results = []
-for column in columns_to_compare:
-    smd, mean1, mean2, pooled_std = calculate_smd(k_pixels[column], m_pixels[column])
-    smd_results.append((column, smd, mean1, mean2, pooled_std))
 
-# Convert the results to a DataFrame for better readability
-smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std'])
-print(smd_df)
-
-
-
-
-
-
-
-def find_match_iteration(
-    k_parquet_filename: str,
-    m_parquet_filename: str,
-    start_year: int,
-    output_folder: str,
-    idx_and_seed: tuple[int, int]
-) -> None:
-    logging.info("Find match iteration %d of %d", idx_and_seed[0] + 1, REPEAT_MATCH_FINDING)
-    rng = np.random.default_rng(idx_and_seed[1])
-
-    logging.info("Loading K from %s", k_parquet_filename)
-    k_pixels = pd.read_parquet(k_parquet_filename)
-    
-    logging.info("Loading M from %s", m_parquet_filename)
-    m_pixels = pd.read_parquet(m_parquet_filename)
-    
-    # concat m and k
-    km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))),
-                           m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True)
-    
-    # Find the continuous columns
-    exclude_columns = ['ID', 'x', 'y', 'lat', 'lng', 'country', 'ecoregion', 'trt']
-    exclude_columns += [col for col in km_pixels.columns if col.startswith('luc')]
-    continuous_columns = km_pixels.columns.difference(exclude_columns)
-    # Categorical columns
-    match_cats = ["ecoregion", "country", "cluster"] + ["luc_" + str(year) for year in match_years]
-    
-    logging.info("Starting PCA transformation of k and m union. km_pixels.shape: %a", {km_pixels.shape})
-    # PCA transform and conversion to 32 bit ints for continuous only
-    km_pca = to_pca_int32(km_pixels[continuous_columns])
-    logging.info("Done PCA transformation")
-    
-    # Extract K and M pixels  - this might be unnecessary I just wanted to make sure 
-    # K and M were in the same order here and in the PCA transform
-    k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt']
-    m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl']
-    # Extract K and M PCA transforms
-    k_pca = km_pca.loc[km_pixels['trt'] == 'trt'].to_numpy()
-    m_pca = km_pca.loc[km_pixels['trt'] == 'ctrl'].to_numpy()
-    
-    # Sample from K and M
-    k_sub_size = int(k_pixels.shape[0]* K_SUB_PROPORTION)
-    m_sub_size = int(m_pixels.shape[0] * M_SUB_PROPORTION)
-    # Define indexs for the samples from K and M
-    k_random_indices = np.random.choice(k_pixels.shape[0], size=k_sub_size, replace=False)
-    m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False)
-    # Take random samples from K and M pixels
-    k_sub = k_pixels.iloc[k_random_indices]
-    m_sub = m_pixels.iloc[m_random_indices]
-    # Take corresponding random samples from the PCA transformed K and M 
-    k_sub_pca = k_pca[k_random_indices,:]
-    m_sub_pca = m_pca[m_random_indices,:]
-    
-    logging.info("Samples taken from K and M. k_sub.shape: %a; m_sub.shape: %a", {k_sub.shape, m_sub.shape})
-    
-    # Identify the unique combinations of luc columns
-    k_cat_combinations = k_sub[match_cats].drop_duplicates().sort_values(by=match_cats, ascending=[True] * len(match_cats))
-    
-    pairs_list = []
-    matchless_list = []
-    
-    logging.info("Starting greedy matching... k_sub.shape: %s, m_sub.shape: %s",
-                                  k_sub.shape, m_sub.shape)
-    
-    start_time = time.time()
-    for i in range(0, k_cat_combinations.shape[0]):
-        # i = 6 # ith element of the unique combinations of the luc time series in k
-        # for in range()
-        k_cat_comb = k_cat_combinations.iloc[i]
-        k_cat = k_sub[(k_sub[match_cats] == k_cat_comb).all(axis=1)]
-        k_cat_pca = k_sub_pca[(k_sub[match_cats] == k_cat_comb).all(axis=1)]
-        
-        # Find the subset in km_pixels that matches this combination
-        m_cat = m_sub[(m_sub[match_cats] == k_cat_comb).all(axis=1)]
-        m_cat_pca = m_sub_pca[(m_sub[match_cats] == k_cat_comb).all(axis=1)]
-        
-        if VERBOSE:
-            print('ksub_cat:' + str(k_cat.shape[0]))
-            print('msub_cat:' + str(m_cat.shape[0]))
-        
-        # If there is no suitable match for the pre-project luc time series
-        # Then it may be preferable to just take the luc state at t0
-        # m_luc_comb = m_pixels[(m_pixels[match_luc_years[1:3]] == K_luc_comb[1:3]).all(axis=1)]
-        # m_luc_comb = m_pixels[(m_pixels[match_luc_years[2:3]] == K_luc_comb[2:3]).all(axis=1)]
-        # For if there are no matches return nothing
-        
-        if(m_cat.shape[0] < k_cat.shape[0] * 5):
-            # print("M insufficient for matching. Set VERBOSE to True for more details.")
-            # Append the matchless DataFrame to the list
-            matchless_list.append(k_cat)
-            continue
-        
-        # Find the matches
-        matches_index = loop_match(m_cat_pca, k_cat_pca)
-        m_cat_matches = m_cat.iloc[matches_index]
-        
-        # i = 0
-        # matched = pd.concat([k_cat.iloc[i], m_cat.iloc[matches[i]]], axis=1, ignore_index=True)
-        # matched.columns = ['trt', 'ctrl']
-        # matched
-        #Looks great!
-        columns_to_compare = ['access', 'cpc0_d', 'cpc0_u', 'cpc10_d', 'cpc10_u', 'cpc5_d', 'cpc5_u', 'elevation', 'slope']
-        # Calculate SMDs for the specified columns
-        smd_results = []
-        for column in columns_to_compare:
-            smd, mean1, mean2, pooled_std = calculate_smd(k_cat[column], m_cat_matches[column])
-            smd_results.append((column, smd, mean1, mean2, pooled_std))
-        
-        # Convert the results to a DataFrame for better readability
-        smd_df = pd.DataFrame(smd_results, columns=['Variable', 'SMD', 'Mean_k_cat', 'Mean_m_cat', 'Pooled_std'])
-        
-        if VERBOSE:
-            # Print the results
-            print("categorical combination:")
-            print(k_cat_comb)
-            # Count how many items in 'column1' are not equal to the specified integer value
-            print("LUC flips in K:")
-            (k_cat['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum()
-            print("LUC flips in matches:")
-            (m_cat_matches['luc_2022'] != k_cat_comb['luc_' + str(t0)]).sum()
-            print("Standardized Mean Differences:")
-            print(smd_df)
-        
-        # Join the pairs into one dataframe:
-        k_cat = k_cat.reset_index(drop = True)
-        m_cat_matches = m_cat_matches.reset_index(drop = True)  
-        pairs_df = pd.concat([k_cat.add_prefix('k_'), m_cat_matches.add_prefix('s_')], axis=1)
-        
-        # Append the resulting DataFrame to the list
-        pairs_list.append(pairs_df)
-
-    # Combine all the DataFrames in the list into a single DataFrame
-    pairs = pd.concat(pairs_list, ignore_index=True)
-    matchless = pd.concat(matchless_list, ignore_index=True)
-    
-    logging.info("Finished greedy matching... pairs.shape: %s, matchless.shape: %s",
-                                  pairs.shape, matchless.shape)
-    
-    logging.info("Starting storing matches...")
-    pairs.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}.parquet'))
-    matchless.to_parquet(os.path.join(output_folder, f'{idx_and_seed[1]}_matchless.parquet'))
-    
-    logging.info("Finished find match iteration")
-
-
-def find_pairs(
-    k_parquet_filename: str,
-    m_parquet_filename: str,
-    start_year: int,
-    seed: int,
-    output_folder: str,
-    processes_count: int
-) -> None:
-    logging.info("Starting find pairs")
-    os.makedirs(output_folder, exist_ok=True)
-
-    rng = np.random.default_rng(seed)
-    iteration_seeds = zip(range(REPEAT_MATCH_FINDING), rng.integers(0, 1000000, REPEAT_MATCH_FINDING))
-
-    with Pool(processes=processes_count) as pool:
-        pool.map(
-            partial(
-                find_match_iteration,
-                k_parquet_filename,
-                m_parquet_filename,
-                start_year,
-                output_folder
-            ),
-            iteration_seeds
-        )
-
-
-def main():
-    # If you use the default multiprocess model then you risk deadlocks when logging (which we
-    # have hit). Spawn is the default on macOS, but not on Linux.
-    set_start_method("spawn")
-
-    parser = argparse.ArgumentParser(description="Takes K and M and finds 100 sets of matches.")
-    parser.add_argument(
-        "--k",
-        type=str,
-        required=True,
-        dest="k_filename",
-        help="Parquet file containing pixels from K as generated by calculate_k.py"
-    )
-    parser.add_argument(
-        "--m",
-        type=str,
-        required=True,
-        dest="m_filename",
-        help="Parquet file containing pixels from M as generated by build_m_table.py"
-    )
-    parser.add_argument(
-        "--start_year",
-        type=int,
-        required=True,
-        dest="start_year",
-        help="Year project started."
-    )
-    parser.add_argument(
-        "--seed",
-        type=int,
-        required=True,
-        dest="seed",
-        help="Random number seed, to ensure experiments are repeatable."
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        required=True,
-        dest="output_directory_path",
-        help="Directory into which output matches will be written. Will be created if it does not exist."
-    )
-    parser.add_argument(
-        "-j",
-        type=int,
-        required=False,
-        default=round(cpu_count() / 2),
-        dest="processes_count",
-        help="Number of concurrent threads to use."
-    )
-    args = parser.parse_args()
-
-    find_pairs(
-        args.k_filename,
-        args.m_filename,
-        args.start_year,
-        args.seed,
-        args.output_directory_path,
-        args.processes_count
-    )
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/methods/matching/find_pairs.py b/methods/matching/find_pairs.py
index 62e9e9c..e253014 100644
--- a/methods/matching/find_pairs.py
+++ b/methods/matching/find_pairs.py
@@ -20,11 +20,15 @@
 # m_parquet_filename = '/maps/aew85/tmf_pipe_out/1201/tom_pairs/matches.parquet'
 # luc_match = True
 
+t0 = 2018
+k_parquet_filename = '/maps/aew85/tmf_pipe_out/fastfp_test_ona/k.parquet'
+m_parquet_filename = '/maps/aew85/tmf_pipe_out/fastfp_test_ona/matches.parquet'
+
 REPEAT_MATCH_FINDING = 100
 DEBUG = False
 
 K_SUB_PROPORTION = 0.01
-M_SUB_PROPORTION = 0.1
+M_SUB_PROPORTION = 1
 # Number of clusters
 NUM_CLUSTERS = 9
 # Number of iterations for K means fitting
@@ -98,8 +102,8 @@ def calculate_smd(group1, group2):
     return smd, mean1, mean2, pooled_std
 
 def find_match_iteration(
-    km_pixels: pd.DataFrame,
-    km_pca: np.ndarray,
+    k_pixels: pd.DataFrame,
+    m_pixels: pd.DataFrame,
     start_year: int,
     luc_match: bool,
     output_folder: str,
@@ -115,6 +119,29 @@ def find_match_iteration(
     else:
         match_cats = ["ecoregion", "country", "cluster"]
     
+    if(m_pixels.shape[0] > (k_pixels.shape[0])):
+        m_sub_size = int(k_pixels.shape[0]) # First down sample M as it is ~230 million points    
+        m_random_indices = np.random.choice(m_pixels.shape[0], size=m_sub_size, replace=False)
+        m_pixels = m_pixels.iloc[m_random_indices]
+    
+    # concat m and k
+    km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))),
+                        m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True)
+    
+    # Extract only the continuous columns
+    km_pixels_distance = km_pixels[DISTANCE_COLUMNS]
+    # PCA transform and conversion to 32 bit ints
+    logging.info("Transforming continuous variables to PCA space")
+    km_pca = to_pca_int32(km_pixels_distance)
+    # Find clusters using Kmeans
+    logging.info("Starting cluster assignment using kmeans")
+    # Initialize the KMeans object
+    kmeans = faiss.Kmeans(d=km_pca.shape[1], k=NUM_CLUSTERS, niter=NUM_ITERATIONS, verbose=True)
+    # Perform clustering
+    kmeans.train(km_pca)
+    # Get cluster assignments
+    km_pixels['cluster'] = kmeans.index.search(km_pca, 1)[1].flatten()
+    
     # Extract K and M pixels 
     k_pixels = km_pixels.loc[km_pixels['trt'] == 'trt']
     m_pixels = km_pixels.loc[km_pixels['trt'] == 'ctrl']
@@ -223,23 +250,6 @@ def find_pairs(
     k_pixels = pd.read_parquet(k_parquet_filename)
     logging.info("Loading M from %s", m_parquet_filename)
     m_pixels = pd.read_parquet(m_parquet_filename)
-    # concat m and k
-    km_pixels = pd.concat([k_pixels.assign(trt='trt', ID=range(0, len(k_pixels))),
-                        m_pixels.assign(trt='ctrl', ID=range(0, len(m_pixels)))], ignore_index=True)
-    
-    # Extract only the continuous columns
-    km_pixels_distance = km_pixels[DISTANCE_COLUMNS]
-    # PCA transform and conversion to 32 bit ints
-    logging.info("Transforming continuous variables to PCA space")
-    km_pca = to_pca_int32(km_pixels_distance)
-    # Find clusters using Kmeans
-    logging.info("Starting cluster assignment using kmeans")
-    # Initialize the KMeans object
-    kmeans = faiss.Kmeans(d=km_pca.shape[1], k=NUM_CLUSTERS, niter=NUM_ITERATIONS, verbose=True)
-    # Perform clustering
-    kmeans.train(km_pca)
-    # Get cluster assignments
-    km_pixels['cluster'] = kmeans.index.search(km_pca, 1)[1].flatten()
     
     logging.info("Starting find pairs")
     os.makedirs(output_folder, exist_ok=True)
@@ -251,8 +261,8 @@ def find_pairs(
         pool.map(
             partial(
                 find_match_iteration,
-                km_pixels,
-                km_pca,
+                k_pixels,
+                m_pixels,
                 start_year,
                 luc_match,
                 output_folder

From 81377a6217247dd8635223a4cac2bb66785c50de Mon Sep 17 00:00:00 2001
From: Abby Williams <149403544+abbyevewilliams@users.noreply.github.com>
Date: Wed, 7 Aug 2024 14:42:37 +0100
Subject: [PATCH 4/6] Update calculate_k.py

Removed the problematic lines for ecoregion and access
---
 methods/matching/calculate_k.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/methods/matching/calculate_k.py b/methods/matching/calculate_k.py
index dba2284..60ad146 100644
--- a/methods/matching/calculate_k.py
+++ b/methods/matching/calculate_k.py
@@ -73,7 +73,6 @@ def build_layer_collection(
     #     RasterLayer.layer_from_file(os.path.join(ecoregions_directory_path, filename)) for filename in
     #         glob.glob("*.tif", root_dir=ecoregions_directory_path)
     # ], name="ecoregions")
-    ecoregions = RasterLayer.layer_from_file(ecoregions_directory_path)
 
     elevation = GroupLayer([
         RasterLayer.layer_from_file(os.path.join(elevation_directory_path, filename)) for filename in
@@ -88,7 +87,6 @@ def build_layer_collection(
     #     RasterLayer.layer_from_file(os.path.join(access_directory_path, filename)) for filename in
     #         glob.glob("*.tif", root_dir=access_directory_path)
     # ], name="access")
-    access = RasterLayer.layer_from_file(access_directory_path)
 
     countries = RasterLayer.layer_from_file(countries_raster_filename)
 

From 2bec9d97789b45c077481591c0c2465d9fee7b66 Mon Sep 17 00:00:00 2001
From: Abby Williams <149403544+abbyevewilliams@users.noreply.github.com>
Date: Thu, 8 Aug 2024 12:46:30 +0100
Subject: [PATCH 5/6] Update calculate_k.py

Uncommented lines needed for ecoregion and access
---
 methods/matching/calculate_k.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/methods/matching/calculate_k.py b/methods/matching/calculate_k.py
index 60ad146..a5fa156 100644
--- a/methods/matching/calculate_k.py
+++ b/methods/matching/calculate_k.py
@@ -69,10 +69,10 @@ def build_layer_collection(
 
     # ecoregions is such a heavy layer it pays to just rasterize it once - we should possibly do this once
     # as part of import of the ecoregions data
-    # ecoregions = GroupLayer([
-    #     RasterLayer.layer_from_file(os.path.join(ecoregions_directory_path, filename)) for filename in
-    #         glob.glob("*.tif", root_dir=ecoregions_directory_path)
-    # ], name="ecoregions")
+    ecoregions = GroupLayer([
+        RasterLayer.layer_from_file(os.path.join(ecoregions_directory_path, filename)) for filename in
+            glob.glob("*.tif", root_dir=ecoregions_directory_path)
+    ], name="ecoregions")
 
     elevation = GroupLayer([
         RasterLayer.layer_from_file(os.path.join(elevation_directory_path, filename)) for filename in
@@ -83,10 +83,10 @@ def build_layer_collection(
             glob.glob("slope*.tif", root_dir=slope_directory_path)
     ], name="slopes")
 
-    # access = GroupLayer([
-    #     RasterLayer.layer_from_file(os.path.join(access_directory_path, filename)) for filename in
-    #         glob.glob("*.tif", root_dir=access_directory_path)
-    # ], name="access")
+    access = GroupLayer([
+        RasterLayer.layer_from_file(os.path.join(access_directory_path, filename)) for filename in
+            glob.glob("*.tif", root_dir=access_directory_path)
+    ], name="access")
 
     countries = RasterLayer.layer_from_file(countries_raster_filename)
 

From fa83f7ade3c632a9fb5783230f39cfae7c4c7c7e Mon Sep 17 00:00:00 2001
From: swinersha <twswinfield@gmai.com>
Date: Mon, 12 Aug 2024 15:17:21 +0000
Subject: [PATCH 6/6] feat: code to tile M into spatial cells added

---
 methods/matching/tile_m.py | 111 +++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 methods/matching/tile_m.py

diff --git a/methods/matching/tile_m.py b/methods/matching/tile_m.py
new file mode 100644
index 0000000..8df0064
--- /dev/null
+++ b/methods/matching/tile_m.py
@@ -0,0 +1,111 @@
+
+import pandas as pd
+import numpy as np
+from pyproj import Proj
+from numba import jit
+import time
+import os
+
+grid_resolution_m = 5000
+
+# Read in the data
+m_parquet_filename = '/maps/aew85/tmf_pipe_out/1201/matches.parquet'
+
+m_parquet_dirname = os.path.dirname(m_parquet_filename)
+
+k_pixels = pd.read_parquet(k_parquet_filename)
+# k_pixels = pd.read_parquet('/maps/tws36/tmf_pipe_out/1201/k_all.parquet')
+m_pixels = pd.read_parquet(m_parquet_filename)
+
+def assign_cells(aeqd_x, aeqd_y):
+    grid_x = (aeqd_x // grid_resolution_m).astype(int)
+    grid_y = (aeqd_y // grid_resolution_m).astype(int)
+    return grid_x, grid_y
+
+
+# Create the directory if it doesn't exist
+output_dir = m_parquet_dirname + '/m_tiles'
+if not os.path.exists(output_dir):
+    os.makedirs(output_dir)
+
+# Define the AEQD projection
+lat_mean = m_pixels['lat'].mean()
+lng_mean = m_pixels['lng'].mean()
+aeqd_proj = Proj(proj='aeqd', lat_0=lat_mean, lon_0=lng_mean, datum='WGS84')
+
+# Vectorized transformation using pyproj (no need for a loop)
+m_pixels['aeqd_x'], m_pixels['aeqd_y'] = aeqd_proj(m_pixels['lng'].values, m_pixels['lat'].values)
+m_pixels['grid_x'], m_pixels['grid_y'] = assign_cells(m_pixels['aeqd_x'], m_pixels['aeqd_y'])
+
+print(m_pixels[['grid_x', 'grid_y']])
+
+# Create a unique identifier for each grid cell
+m_pixels['grid_id'] = m_pixels['grid_x'].astype(str) + '_' + m_pixels['grid_y'].astype(str)
+# Check the number of unique grid cells
+print(f"Number of unique grid cells: {m_pixels['grid_id'].nunique()}")
+
+# # Create a color palette with a unique color for each grid_id
+# unique_grid_ids = m_pixels['grid_id'].unique()
+# palette = sns.color_palette("husl", len(unique_grid_ids))
+# color_dict = dict(zip(unique_grid_ids, palette))
+
+# # Map the grid_id to a color
+# m_pixels['color'] = m_pixels['grid_id'].map(color_dict)
+
+# # Plot the points
+# import matplotlib.pyplot as plt
+# import seaborn as sns
+# plt.figure(figsize=(10, 8))
+# plt.scatter(m_pixels['lng'], m_pixels['lat'], c=m_pixels['color'], s=1, alpha=0.6)
+# plt.title('Points Colored by Unique Grid Cell')
+# plt.xlabel('Longitude')
+# plt.ylabel('Latitude')
+# plt.savefig('Figures/Gola_m_grid_cells.png')
+# plt.close()
+
+# Loop through each unique grid_id
+for grid_id in m_pixels['grid_id'].unique():
+    # Filter the data for this grid_id
+    grid_data = m_pixels[m_pixels['grid_id'] == grid_id]
+    
+    # Create a unique file name
+    file_name = f"m_tile_{grid_id}.parquet"
+    file_path = os.path.join(output_dir, file_name)
+    
+    # Save the DataFrame as a parquet file
+    grid_data.to_parquet(file_path, index=False)
+    print(f"Saved {file_name} with {len(grid_data)} records.")
+
+
+#-----------------------------------------
+# Code to move to find pairs
+# just testing for now
+
+
+
+import random
+
+# Directory containing the Parquet files
+input_dir = m_parquet_dirname + '/m_tiles'
+
+# List all Parquet files in the directory
+all_files = [f for f in os.listdir(input_dir) if f.endswith('.parquet')]
+
+# Determine the number of files to read (10% of total files)
+num_files_to_read = max(1, int(0.1 * len(all_files)))  # Ensure at least 1 file is read
+
+# Randomly select 10% of the files
+selected_files = random.sample(all_files, num_files_to_read)
+
+# Read the selected files and store them in a list of DataFrames
+dfs = []
+for file_name in selected_files:
+    file_path = os.path.join(input_dir, file_name)
+    df = pd.read_parquet(file_path)
+    dfs.append(df)
+
+# Concatenate all the DataFrames into a single DataFrame
+combined_df = pd.concat(dfs, ignore_index=True)
+
+# Print information about the loaded data
+print(f"Loaded {len(dfs)} files with a total of {len(combined_df)} records.")
\ No newline at end of file