import pandas as pd
import numpy as np
from pathlib import Path


def combine_databases():
    """Merge newly downloaded BioRxiv/MedRxiv parquet updates and their
    embeddings into the existing aggregated database, deduplicating by title."""
    # Define paths
    aggregated_data_path = Path("aggregated_data")
    db_update_bio_path = Path("db_update")
    biorxiv_embeddings_path = Path("biorxiv_ubin_embaddings.npy")
    embed_update_bio_path = Path("embed_update")
    db_update_med_path = Path("db_update_med")
    embed_update_med_path = Path("embed_update_med")

    # Load existing database and embeddings for BioRxiv
    df_bio_existing = pd.read_parquet(aggregated_data_path)
    bio_embeddings_existing = np.load(biorxiv_embeddings_path, allow_pickle=True)
    print(
        f"Existing BioRxiv data shape: {df_bio_existing.shape}, "
        f"existing BioRxiv embeddings shape: {bio_embeddings_existing.shape}"
    )

    # Determine the embedding size from the existing embeddings
    embedding_size = bio_embeddings_existing.shape[1]

    # Prepare lists to collect new updates
    bio_dfs_list = []
    bio_embeddings_list = []

    # Helper function to process updates from a specified directory
    def process_updates(new_data_directory, updated_embeddings_directory, dfs_list, embeddings_list):
        new_data_files = sorted(Path(new_data_directory).glob("*.parquet"))
        for data_file in new_data_files:
            corresponding_embedding_file = Path(updated_embeddings_directory) / (data_file.stem + ".npy")
            if corresponding_embedding_file.exists():
                df = pd.read_parquet(data_file)
                new_embeddings = np.load(corresponding_embedding_file, allow_pickle=True)

                # Skip this update if the row counts of the DataFrame and the embeddings disagree
                if df.shape[0] != new_embeddings.shape[0]:
                    print(
                        f"Shape mismatch for {data_file.name}: DataFrame has {df.shape[0]} rows, "
                        f"embeddings have {new_embeddings.shape[0]} rows. Skipping."
                    )
                    continue

                # Skip this update if its embedding width differs from the existing database
                if new_embeddings.shape[1] != embedding_size:
                    print(f"Skipping {data_file.name} due to embedding size mismatch.")
                    continue

                dfs_list.append(df)
                embeddings_list.append(new_embeddings)
            else:
                print(f"No corresponding embedding file found for {data_file.name}")

    # Process BioRxiv updates (MedRxiv updates are processed further below)
    process_updates(db_update_bio_path, embed_update_bio_path, bio_dfs_list, bio_embeddings_list)

    # Concatenate all BioRxiv updates
    df_bio_updates = pd.concat(bio_dfs_list) if bio_dfs_list else pd.DataFrame()
    bio_embeddings_updates = np.vstack(bio_embeddings_list) if bio_embeddings_list else np.array([])

    # Append new BioRxiv data to the existing data; duplicates are removed below
    df_bio_combined = pd.concat([df_bio_existing, df_bio_updates])

    # Mask of unique titles, keeping the last (most recently updated) occurrence.
    # The embeddings are stacked in the same row order as the DataFrame, so the
    # same mask keeps both aligned.
    bio_mask = ~df_bio_combined.duplicated(subset=["title"], keep="last")
    df_bio_combined = df_bio_combined[bio_mask]

    # Combine BioRxiv embeddings, ensuring alignment with the DataFrame
    bio_embeddings_combined = (
        np.vstack([bio_embeddings_existing, bio_embeddings_updates])
        if bio_embeddings_updates.size
        else bio_embeddings_existing
    )

    # Filter the embeddings with the same mask used for the DataFrame
    bio_embeddings_combined = bio_embeddings_combined[bio_mask.to_numpy()]

    assert (
        df_bio_combined.shape[0] == bio_embeddings_combined.shape[0]
    ), "Shape mismatch between BioRxiv DataFrame and embeddings"

    print(f"Filtered BioRxiv DataFrame shape: {df_bio_combined.shape}")
    print(f"Filtered BioRxiv embeddings shape: {bio_embeddings_combined.shape}")

    # Save combined BioRxiv DataFrame and embeddings
    combined_biorxiv_data_path = aggregated_data_path / "combined_biorxiv_data.parquet"
    df_bio_combined.to_parquet(combined_biorxiv_data_path)
    print(f"Saved combined BioRxiv DataFrame to {combined_biorxiv_data_path}")
    combined_biorxiv_embeddings_path = Path("biorxiv_ubin_embaddings.npy")
    np.save(combined_biorxiv_embeddings_path, bio_embeddings_combined)
    print(f"Saved combined BioRxiv embeddings to {combined_biorxiv_embeddings_path}")

    # Prepare lists to collect new MedRxiv updates
    med_dfs_list = []
    med_embeddings_list = []

    process_updates(db_update_med_path, embed_update_med_path, med_dfs_list, med_embeddings_list)

    # Stop here if there are no MedRxiv updates: an empty DataFrame has no
    # "title" column, so the deduplication below would raise a KeyError
    if not med_dfs_list:
        print("No MedRxiv updates found; skipping MedRxiv processing.")
        return

    # Concatenate all MedRxiv updates
    df_med_combined = pd.concat(med_dfs_list)
    med_embeddings_combined = np.vstack(med_embeddings_list)

    last_date_in_med_database = df_med_combined["date"].max()

    # Mask of unique titles, applied to the DataFrame and embeddings alike
    med_mask = ~df_med_combined.duplicated(subset=["title"], keep="last")
    df_med_combined = df_med_combined[med_mask]
    med_embeddings_combined = med_embeddings_combined[med_mask.to_numpy()]

    assert (
        df_med_combined.shape[0] == med_embeddings_combined.shape[0]
    ), "Shape mismatch between MedRxiv DataFrame and embeddings"

    print(f"Filtered MedRxiv DataFrame shape: {df_med_combined.shape}")
    print(f"Filtered MedRxiv embeddings shape: {med_embeddings_combined.shape}")

    # Save combined MedRxiv DataFrame and embeddings
    combined_medrxiv_data_path = db_update_med_path / f"database_{last_date_in_med_database}.parquet"
    df_med_combined.to_parquet(combined_medrxiv_data_path)
    print(f"Saved combined MedRxiv DataFrame to {combined_medrxiv_data_path}")

    combined_medrxiv_embeddings_path = embed_update_med_path / f"database_{last_date_in_med_database}.npy"
    np.save(combined_medrxiv_embeddings_path, med_embeddings_combined)
    print(f"Saved combined MedRxiv embeddings to {combined_medrxiv_embeddings_path}")


if __name__ == "__main__":
    combine_databases()