diff --git a/Sample_Manager/Script.ipynb b/Sample_Manager/Script.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..557ddbf47e9d12e03f52de80a4363c042bb50888 --- /dev/null +++ b/Sample_Manager/Script.ipynb @@ -0,0 +1,560 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f154826c", + "metadata": {}, + "source": [ + "## Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e650569d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import labkey\n", + "from labkey.api_wrapper import APIWrapper\n", + "import pandas as pd\n", + "import json\n", + "import urllib3\n", + "import urllib\n", + "import os\n" + ] + }, + { + "cell_type": "markdown", + "id": "df672b72", + "metadata": {}, + "source": [ + "## Project Configuration and LabKey API Initialization" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2f599db6", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Define project and LabKey server details\n", + "PROJECT = 'sciCORE-dev/Ankit/Sample_Manager_Test_Dataset'\n", + "LABKEY_SERVER = \"labkey-pro-dev.scicore.unibas.ch\"\n", + "CONTEXT_PATH = '' # Use 'labkey' for main server\n", + "\n", + "# Initialize LabKey API Wrapper\n", + "api = APIWrapper(LABKEY_SERVER, PROJECT, CONTEXT_PATH, use_ssl=True)\n" + ] + }, + { + "cell_type": "markdown", + "id": "3def1a57", + "metadata": {}, + "source": [ + "## Authentication Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "78ec9c0a", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Path to .netrc file for authentication\n", + "NETRC_FILE = os.path.join(os.path.expanduser('~'), '.netrc')\n", + "\n", + "# Verify and read .netrc file\n", + "if not os.path.isfile(NETRC_FILE):\n", + " raise FileNotFoundError(f'.netrc file not found: {NETRC_FILE}')\n", + "\n", + "# Extract login credentials\n", + "netrc_df = pd.read_csv(NETRC_FILE, sep=' ', header=None, names=['key', 'value'])\n", + "login = 
netrc_df.loc[netrc_df['key'] == 'login', 'value'].iloc[0]\n", + "password = netrc_df.loc[netrc_df['key'] == 'password', 'value'].iloc[0]\n", + "\n", + "# Authentication headers\n", + "headers = urllib3.util.make_headers(basic_auth=f'{login}:{password}')\n" + ] + }, + { + "cell_type": "markdown", + "id": "f5f58e20", + "metadata": {}, + "source": [ + "## Verify Project Directory" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cef2e4e8", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "params = {\"includeSubfolders\": True, \"depth\": 1}\n", + "url = api.server_context.build_url(\"project\", \"getContainers.view\", container_path=PROJECT.replace(' ', '%20'))\n", + "\n", + "resp = api.server_context.make_request(url, urllib.parse.urlencode(params).encode(), headers=headers, non_json_response=True)\n", + "if resp.status_code == 404:\n", + " raise Exception(f'Project not found: {PROJECT}. Please create it first.')\n" + ] + }, + { + "cell_type": "markdown", + "id": "9203b282", + "metadata": {}, + "source": [ + "## Create and Populate Source Type 'Study'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0d422897", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/izarda0000/miniconda3/lib/python3.12/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default\n", + " warn(\"Workbook contains no default style, apply openpyxl's default\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Success: Domain created for sample source: Study\n", + "Success: Data inserted into the DataClass: Study\n" + ] + } + ], + "source": [ + "# Define the source Excel file for study data\n", + "SOURCE_STUDY = 'Study.xlsx'\n", + "\n", + "# Read data from the Excel file\n", + "try:\n", + " df = pd.read_excel(SOURCE_STUDY)\n", + "except Exception as e:\n", + " print(f'Error reading Excel file 
{SOURCE_STUDY}: {e}')\n", + " exit(1)\n", + "\n", + "# Extract column names\n", + "columns = df.columns[1:].tolist()\n", + "\n", + "# Define LabKey fields for the DataClass\n", + "labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]\n", + "labkey_fields = [{\"name\": col, \"rangeURI\": \"string\"} for col in columns]\n", + "\n", + "# Define DataClass domain\n", + "study_domain_definition = {\n", + " \"kind\": \"DataClass\",\n", + " \"domainDesign\": {\n", + " \"name\": \"Study\",\n", + " \"fields\": labkey_fields\n", + " },\n", + " \"options\": {\n", + " \"category\": \"sources\"\n", + " }\n", + "}\n", + "\n", + "# Create the DataClass domain in LabKey\n", + "try:\n", + " created_dataclass_domain = api.domain.create(study_domain_definition)\n", + " print(\"Success: Domain created for sample source: Study\")\n", + "except Exception as e:\n", + " print(f'Error creating domain: {e}')\n", + " exit(1)\n", + "\n", + "# Insert data into the DataClass 'Study'\n", + "\n", + "sources_rows = []\n", + "# Add Samples of the Sample Type\n", + "for i, row in df.iterrows():\n", + " sources_rows.append(row[columns].fillna('').to_dict())\n", + " sources_rows[-1]['Name'] = row['SourceID']\n", + "\n", + "\n", + "# Insert data into the DataClass 'Study'\n", + "try:\n", + " insert_result = api.query.insert_rows(\"exp.data\", \"Study\", sources_rows)\n", + " print(\"Success: Data inserted into the DataClass: Study\")\n", + "except Exception as e:\n", + " print(f'Error inserting data: {e}')\n", + " exit(1)\n" + ] + }, + { + "cell_type": "markdown", + "id": "cee50632", + "metadata": {}, + "source": [ + "## Create and Populate Source Type 'Patient' (Linked to Study)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "dfb86ebf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Success: Domain created for sample source: Patient\n", + "Success: Data inserted into the DataClass: Patient\n" + ] + } + ], + "source": [ + "# 
Define the source Excel file for study data\n", + "SOURCE_PATIENT = 'Patient.xlsx'\n", + "\n", + "# Read data from the Excel file\n", + "try:\n", + " df = pd.read_excel(SOURCE_PATIENT)\n", + "except Exception as e:\n", + " print(f'Error reading Excel file {SOURCE_PATIENT}: {e}')\n", + " exit(1)\n", + "\n", + "# Extract column names except for the last column SourceStudy and SourceID\n", + "columns = df.columns.tolist()[1:-1]\n", + "\n", + "# Define LabKey fields for the DataClass\n", + "labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]\n", + "labkey_fields = [{\"name\": col, \"rangeURI\": \"string\"} for col in columns]\n", + "\n", + "# Define DataClass domain\n", + "patient_domain_definition = {\n", + " \"kind\": \"DataClass\",\n", + " \"domainDesign\": {\n", + " \"name\": \"Patient\",\n", + " \"fields\": labkey_fields\n", + " },\n", + " \"options\": {\n", + " \"category\": \"sources\",\n", + " \"name\": \"Patient\",\n", + " \"importAliases\": {\n", + "\n", + " \"SourceStudy\": {\n", + "\n", + " \"inputType\": \"dataInputs/Study\"\n", + "\n", + " }\n", + "\n", + " }\n", + " }\n", + "}\n", + "\n", + "# Create the DataClass domain in LabKey\n", + "try:\n", + " created_dataclass_domain = api.domain.create(patient_domain_definition)\n", + " print(\"Success: Domain created for sample source: Patient\")\n", + "except Exception as e:\n", + " print(f'Error creating domain: {e}')\n", + " exit(1)\n", + "\n", + "# Insert data into the DataClass 'Patient'\n", + "\n", + "sources_rows = []\n", + "\n", + "# Add Source of the Source Type 'Patient'. 
Also now include 'SourceStudy' column to add lineage \n", + "for i, row in df.iterrows():\n", + " sources_rows.append(row[columns + ['SourceStudy']].fillna('').to_dict())\n", + " sources_rows[-1]['Name'] = row['SourceID']\n", + "\n", + "\n", + "# Insert data into the DataClass 'Study'\n", + "try:\n", + " insert_result = api.query.insert_rows(\"exp.data\", \"Patient\", sources_rows)\n", + " print(\"Success: Data inserted into the DataClass: Patient\")\n", + "except Exception as e:\n", + " print(f'Error inserting data: {e}')\n", + " exit(1)" + ] + }, + { + "cell_type": "markdown", + "id": "36d27f01", + "metadata": {}, + "source": [ + "## Process Samples and Create Storage Hierarchy" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8a5bf323", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created box: 6522\n", + "Created box: 6523\n" + ] + } + ], + "source": [ + "SOURCE_SAMPLES = 'Samples.xlsx'\n", + "\n", + "# Read data from the Excel file\n", + "\n", + "try:\n", + " df = pd.read_excel(SOURCE_SAMPLES)\n", + "except Exception as e:\n", + " print(f'Error reading Excel file {SOURCE_SAMPLES}: {e}')\n", + " exit(1)\n", + "\n", + "# Sample ID is a reserved field in LabKey, so we need to rename it to SampleIdentifier. \n", + "df.rename(columns={'Sample ID': 'SampleIdentifier'}, inplace=True)\n", + "\n", + "# Get columns for samples table but do not include the 'SourcePatient'. It is only used for mapping to the Patient DataClass\n", + "# Also exclude Sample ID column. It will be renamed to \"Name\" column. 
\n", + "\n", + "columns = df.columns[1:-1].tolist()\n", + "columns\n", + "\n", + "# ### Get Building, Floor, Freezer, Shelf, Rack, Box, and Coordinates from Location \n", + "# Note: Change the heirarchy according to the data in excel file\n", + "\n", + "df['Building'] = df['Location'].str.split('/').str[0]\n", + "df['Floor'] = df['Location'].str.split('/').str[1]\n", + "df['Freezer'] = df['Location'].str.split('/').str[2]\n", + "df['Freezer full'] = df['Building'] + '/' + df['Floor'] + '/' + df['Freezer']\n", + "df['Shelf'] = df['Location'].str.split('/').str[3]\n", + "df['Rack'] = df['Location'].str.split('/').str[4]\n", + "df['Box'] = df['Location'].str.split('/').str[5] + '/' + df['Location'].str.split('/').str[6]\n", + "df['Coordinates'] = df['Box'].str.split(':').str[-1]\n", + "df['Box'] = df['Box'].str.split(':').str[0]\n", + "\n", + "# Convert the 'Coordinates' column to numeric, invalid parsing will be set as NaN\n", + "df['StorageCol'] = pd.to_numeric(df['Coordinates'].str.split('/').str[-1])\n", + "\n", + "df['StorageCol'] = None\n", + "df['StorageRow'] = None\n", + "df.loc[~df['Coordinates'].isna(), 'StorageCol'] = df.loc[~df['Coordinates'].isna(), 'Coordinates'].str.split('/').str[-1].astype(int)\n", + "df.loc[~df['Coordinates'].isna(), 'StorageRow'] = df.loc[~df['Coordinates'].isna(), 'Coordinates'].str.split('/').str[0]\n", + "\n", + "\n", + "# ### Create storage heierarchy in LabKey\n", + "\n", + "# Create Unit Type Box\n", + "try:\n", + " api.storage.delete_storage_item('Storage Unit Type', {'name': 'Box 10x10'})\n", + "except:\n", + " pass\n", + "\n", + "result = api.storage.create_storage_item('Storage Unit Type', {'name': 'Box 10x10', 'description': 'Box 10x10', 'UnitType': 'Box', 'cols': 10, 'rows': 10})\n", + "box_type_id = result['data']['rowId']\n", + "\n", + "df['box_id'] = ''\n", + "# Physical Locations\n", + "for building in df['Building'].dropna().unique():\n", + " # 'Physical Location' -> \n", + " result_building = 
api.storage.create_storage_item(\"Physical Location\", {\"name\": building, \"description\": \"Building\"})\n", + " building_id = result_building['data']['rowId']\n", + " building_df = df.loc[df['Building'] == building]\n", + " for floor in building_df['Floor'].dropna().unique():\n", + " result_floor = api.storage.create_storage_item('Physical Location', {'name': floor, 'description': 'Floor', 'locationId': building_id})\n", + " floor_id = result_floor['data']['rowId']\n", + " floor_df = building_df.loc[building_df['Floor'] == floor]\n", + " # Labkey Terminology = Freezer. Our Dataset = Freezer / Liquid Nitrogen Storage\n", + " for freezer in floor_df['Freezer'].dropna().unique()[:1]:\n", + " freezer_full = '{}-{}-{}'.format(building, floor, freezer)\n", + " freezer_desc = 'Liquid Nitrogen Room'\n", + " if 'Freezer' in freezer:\n", + " freezer_desc = 'Freezer'\n", + " result_freezer = api.storage.create_storage_item('Freezer', {'name': freezer_full, 'description': freezer_desc, 'locationId': floor_id})\n", + " freezer_id = result_freezer['data']['rowId']\n", + " freezer_df = floor_df.loc[floor_df['Freezer'] == freezer]\n", + " # Labkey Terminologoy = Shelf. Our Dataset = Shelf / Tank\n", + " for shelf in freezer_df['Shelf'].dropna().unique():\n", + " shelf_desc = 'Shelf'\n", + " if 'tank' in shelf.lower():\n", + " shelf_desc = 'Tank'\n", + " result_shelf = api.storage.create_storage_item('Shelf', {'name': shelf, 'description': shelf_desc, 'locationId': freezer_id})\n", + " shelf_id = result_shelf['data']['rowId']\n", + " shelf_df = freezer_df.loc[freezer_df['Shelf'] == shelf]\n", + " # Labkey Terminology = Rack. 
Our Dataset = Rack / Tower\n", + " for rack in shelf_df['Rack'].dropna().unique():\n", + " rack_desc = 'Rack'\n", + " if 'tower' in rack.lower():\n", + " rack_desc = 'Tower'\n", + " result_rack = api.storage.create_storage_item('Rack', {'name': rack, 'description': rack_desc, 'locationId': shelf_id})\n", + " rack_id = result_rack['data']['rowId']\n", + " rack_df = shelf_df.loc[shelf_df['Rack'] == rack]\n", + " for box in rack_df['Box'].dropna().unique():\n", + " result_box = api.storage.create_storage_item('Terminal Storage Location', {'name': box, 'typeId': box_type_id, 'locationId': rack_id})\n", + " box_id = result_box['data']['rowId']\n", + " box_df = rack_df.loc[rack_df['Box'] == box]\n", + " df.loc[box_df.index, 'box_id'] = box_id\n", + " df.loc[box_df.index, 'StorageLocation'] = '{}/{}/{}/{}'.format(freezer_full, shelf, rack, box)\n", + " print('Created box: {}'.format(box_id))" + ] + }, + { + "cell_type": "markdown", + "id": "530f62a9", + "metadata": {}, + "source": [ + "## Create Sample Types and Insert Samples into LabKey" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "105001ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Blood', 'Saliva']\n", + "Domain created for sample type: Blood\n", + "Domain created for sample type: Saliva\n", + "Inserted 40 rows into Blood domain.\n", + "Inserted 15 rows into Saliva domain.\n" + ] + } + ], + "source": [ + "sample_types = df['SampleType'].unique().tolist()\n", + "print(sample_types)\n", + "\n", + "# Loop over each sample type and create the domain\n", + "for sample_type in sample_types:\n", + " labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]\n", + " \n", + " for col in columns:\n", + " rangeURI = 'string'\n", + " labkey_fields.append({'name': col, 'rangeURI': rangeURI})\n", + " \n", + " sample_params = {\n", + " 'kind': 'SampleSet',\n", + " 'domainDesign': {\n", + " 'name': sample_type,\n", + " 'fields': labkey_fields,\n", + " 
\"domainKindName\": \"SampleSet\",\n", + " },\n", + " \"options\": {\n", + "\n", + " \"name\": sample_type,\n", + "\n", + " \"nameExpression\": \"S-${genId}\",\n", + "\n", + " \"aliquotNameExpression\": \"${${AliquotedFrom}-:withCounter}\",\n", + "\n", + " \"importAliases\": {\n", + "\n", + " \"SourcePatient\": {\n", + "\n", + " \"inputType\": \"dataInputs/Patient\"\n", + "\n", + " }\n", + " }\n", + " }\n", + " }\n", + " # Create domain using API call\n", + " sample_domain = api.domain.create(sample_params)\n", + " print(f\"Domain created for sample type: {sample_type}\")\n", + "\n", + "# Step 1: Prepare Sample Rows\n", + "sample_rows = []\n", + "# Add Samples of the Sample Type\n", + "for i, row in df.iterrows():\n", + " sample_rows.append(row[columns + ['StorageLocation', 'StorageRow', 'StorageCol', 'SourcePatient']].fillna('').to_dict())\n", + " sample_rows[-1]['Name'] = row['SampleIdentifier']\n", + "\n", + " if not row['StorageLocation'] or not row['StorageRow'] or not row['StorageCol']:\n", + " del sample_rows[-1]['StorageLocation']\n", + " del sample_rows[-1]['StorageRow']\n", + " del sample_rows[-1]['StorageCol']\n", + "\n", + "\n", + "# Step 2: Insert Rows\n", + "for sample_type in sample_types:\n", + " # Filter rows that match the current sample type\n", + " filtered_rows = [row for row in sample_rows if row.get(\"SampleType\") == sample_type]\n", + "\n", + " # Remove 'Type' column from each row before inserting\n", + " rows_to_insert = [{k: v for k, v in row.items() if k != \"SampleType\"} for row in filtered_rows]\n", + "\n", + " # Insert rows using the API\n", + " if rows_to_insert:\n", + " api.query.insert_rows(\"samples\", sample_type, rows_to_insert)\n", + " print(f\"Inserted {len(rows_to_insert)} rows into {sample_type} domain.\")" + ] + }, + { + "cell_type": "markdown", + "id": "b525f127", + "metadata": {}, + "source": [ + "## Optional Cleanup (Commented)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a92c91e9", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nfor sample_type in sample_types:\\n\\n drop_response = api.domain.drop(\"samples\", sample_type)\\n if \"success\" in drop_response:\\n print(\"The dataset domain was deleted.\")\\'\\n'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Delete all sample types and their data\n", + "\n", + "'''\n", + "for sample_type in sample_types:\n", + "\n", + " drop_response = api.domain.drop(\"samples\", sample_type)\n", + " if \"success\" in drop_response:\n", + " print(\"The dataset domain was deleted.\")'\n", + "'''" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Sample_Manager/Script.pdf b/Sample_Manager/Script.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f91058ea3472dd9398cd8fb6087245f6053f79c0 Binary files /dev/null and b/Sample_Manager/Script.pdf differ diff --git a/Sample_Manager/Script.py b/Sample_Manager/Script.py new file mode 100644 index 0000000000000000000000000000000000000000..912b8c6818c8aa37c1b9ab98fa07d9c4eb8b6666 --- /dev/null +++ b/Sample_Manager/Script.py @@ -0,0 +1,375 @@ +# %% [markdown] +# ## Import Libraries + +# %% + +import labkey +from labkey.api_wrapper import APIWrapper +import pandas as pd +import json +import urllib3 +import urllib +import os + + +# %% [markdown] +# ## Project Configuration and LabKey API Initialization + +# %% + +# Define project and LabKey server details +PROJECT = 'sciCORE-dev/Ankit/Sample_Manager_Test_Dataset' +LABKEY_SERVER = "labkey-pro-dev.scicore.unibas.ch" +CONTEXT_PATH = '' # 
# Use 'labkey' for main server

# Initialize LabKey API Wrapper
api = APIWrapper(LABKEY_SERVER, PROJECT, CONTEXT_PATH, use_ssl=True)


# %% [markdown]
# ## Authentication Setup

# %%
# Imported here because only this part of the script needs them:
# `netrc` is the standard-library .netrc parser, and importing `urllib.parse`
# explicitly guarantees the submodule is loaded (a bare `import urllib` does not).
import netrc
import urllib.parse

# Path to .netrc file for authentication
NETRC_FILE = os.path.join(os.path.expanduser('~'), '.netrc')

# Verify and read .netrc file
if not os.path.isfile(NETRC_FILE):
    raise FileNotFoundError(f'.netrc file not found: {NETRC_FILE}')

# Look up the credentials stored for this server. The stdlib parser accepts
# both the one-line and the one-token-pair-per-line .netrc layouts and picks
# the entry whose machine name matches LABKEY_SERVER (or the `default` entry).
credentials = netrc.netrc(NETRC_FILE).authenticators(LABKEY_SERVER)
if credentials is None:
    raise LookupError(f'No credentials for {LABKEY_SERVER} found in {NETRC_FILE}')
login, _account, password = credentials

# Authentication headers
headers = urllib3.util.make_headers(basic_auth=f'{login}:{password}')


# %% [markdown]
# ## Verify Project Directory

# %%
params = {"includeSubfolders": True, "depth": 1}
url = api.server_context.build_url(
    "project",
    "getContainers.view",
    container_path=PROJECT.replace(' ', '%20'),
)

resp = api.server_context.make_request(
    url,
    urllib.parse.urlencode(params).encode(),
    headers=headers,
    non_json_response=True,
)
if resp.status_code == 404:
    raise Exception(f'Project not found: {PROJECT}. Please create it first.')


# %% [markdown]
# ## Helpers shared by the source and sample sections

# %%
def _read_excel_or_exit(path):
    """Return the workbook at *path* as a DataFrame.

    Any read failure ends the script with exit status 1 and a message naming
    the file. SystemExit is raised directly instead of calling ``exit()``,
    which is a helper meant for the interactive interpreter.
    """
    try:
        return pd.read_excel(path)
    except Exception as exc:
        raise SystemExit(f'Error reading Excel file {path}: {exc}') from exc


def _create_source_domain_or_exit(definition, label):
    """Create the DataClass domain described by *definition* in LabKey.

    *label* is only used in the success message. A failure ends the script
    with exit status 1.
    """
    try:
        api.domain.create(definition)
    except Exception as exc:
        raise SystemExit(f'Error creating domain: {exc}') from exc
    print(f"Success: Domain created for sample source: {label}")


def _source_rows(frame, fields, name_column='SourceID'):
    """Build one insertable dict per row of *frame*.

    Each dict holds the *fields* values (missing cells replaced by '') plus a
    'Name' entry taken from *name_column*.
    """
    return [
        {**row[fields].fillna('').to_dict(), 'Name': row[name_column]}
        for _, row in frame.iterrows()
    ]


def _insert_sources_or_exit(data_class, rows):
    """Insert *rows* into the ``exp.data`` query named *data_class*.

    A failure ends the script with exit status 1.
    """
    try:
        api.query.insert_rows("exp.data", data_class, rows)
    except Exception as exc:
        raise SystemExit(f'Error inserting data: {exc}') from exc
    print(f"Success: Data inserted into the DataClass: {data_class}")


# %% [markdown]
# ## Create and Populate Source Type 'Study'

# %%
# Define the source Excel file for study data
SOURCE_STUDY = 'Study.xlsx'

# Read data from the Excel file
df = _read_excel_or_exit(SOURCE_STUDY)

# Every column after the first becomes a string field of the DataClass.
# The 'SourceID' column is sent as the built-in 'Name' instead
# (presumably it is that first column - confirm against the workbook).
columns = df.columns[1:].tolist()
labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]

# Define DataClass domain
study_domain_definition = {
    "kind": "DataClass",
    "domainDesign": {
        "name": "Study",
        "fields": labkey_fields,
    },
    "options": {
        "category": "sources",
    },
}

# Create the DataClass domain in LabKey
_create_source_domain_or_exit(study_domain_definition, "Study")

# Insert the sources into the DataClass 'Study'
sources_rows = _source_rows(df, columns)
_insert_sources_or_exit("Study", sources_rows)


# %% [markdown]
# ## Create and Populate Source Type 'Patient' (Linked to Study)

# %%
# Define the source Excel file for patient data
SOURCE_PATIENT = 'Patient.xlsx'

# Read data from the Excel file
df = _read_excel_or_exit(SOURCE_PATIENT)

# Extract column names except for the last column SourceStudy and SourceID
columns = df.columns.tolist()[1:-1]

# Define LabKey fields for the DataClass
labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]

# Define DataClass domain. The import alias maps the 'SourceStudy' column to
# the 'Study' DataClass as a data input, which is what records the lineage.
patient_domain_definition = {
    "kind": "DataClass",
    "domainDesign": {
        "name": "Patient",
        "fields": labkey_fields,
    },
    "options": {
        "category": "sources",
        "name": "Patient",
        "importAliases": {
            "SourceStudy": {
                "inputType": "dataInputs/Study",
            },
        },
    },
}

# Create the DataClass domain in LabKey
_create_source_domain_or_exit(patient_domain_definition, "Patient")

# Insert the sources into the DataClass 'Patient'.
# 'SourceStudy' is included in each row so the lineage alias above applies.
sources_rows = _source_rows(df, columns + ['SourceStudy'])
_insert_sources_or_exit("Patient", sources_rows)


# %% [markdown]
# ## Process Samples and Create Storage Hierarchy

# %%
SOURCE_SAMPLES = 'Samples.xlsx'

# Read data from the Excel file
df = _read_excel_or_exit(SOURCE_SAMPLES)

# Sample ID is a reserved field in LabKey, so we need to rename it to SampleIdentifier.
df.rename(columns={'Sample ID': 'SampleIdentifier'}, inplace=True)

# Get columns for samples table but do not include the 'SourcePatient'. It is only used for mapping to the Patient DataClass
# Also exclude Sample ID column. It will be renamed to "Name" column.

columns = df.columns[1:-1].tolist()

# ### Get Building, Floor, Freezer, Shelf, Rack, Box, and Coordinates from Location
# Note: Change the hierarchy according to the data in the Excel file.
# The code below reads 'Location' as seven '/'-separated segments:
#   building/floor/freezer/shelf/rack/<box>:<row>/<col>
location_parts = df['Location'].str.split('/')  # split once, reuse for every level

df['Building'] = location_parts.str[0]
df['Floor'] = location_parts.str[1]
df['Freezer'] = location_parts.str[2]
df['Shelf'] = location_parts.str[3]
df['Rack'] = location_parts.str[4]

# The box name and the position share the last two segments ("<box>:<row>" and
# "<col>"), so they are re-joined before being separated at the colon.
box_and_position = location_parts.str[5] + '/' + location_parts.str[6]
df['Box'] = box_and_position.str.split(':').str[0]
df['Coordinates'] = box_and_position.str.split(':').str[-1]

# Rows without coordinates keep None in both storage columns.
df['StorageCol'] = None
df['StorageRow'] = None
has_position = df['Coordinates'].notna()
position_parts = df.loc[has_position, 'Coordinates'].str.split('/')
df.loc[has_position, 'StorageCol'] = position_parts.str[-1].astype(int)
df.loc[has_position, 'StorageRow'] = position_parts.str[0]


# ### Create storage hierarchy in LabKey

def _create_storage_item(item_type, props):
    """Create one storage item of *item_type* and return the rowId from the response."""
    return api.storage.create_storage_item(item_type, props)['data']['rowId']


# Create Unit Type Box. The delete is best effort (presumably it fails when the
# unit type does not exist yet), so a failure is reported but not fatal.
try:
    api.storage.delete_storage_item('Storage Unit Type', {'name': 'Box 10x10'})
except Exception as exc:
    print(f'Note: could not delete storage unit type Box 10x10: {exc}')

box_type_id = _create_storage_item(
    'Storage Unit Type',
    {'name': 'Box 10x10', 'description': 'Box 10x10', 'UnitType': 'Box', 'cols': 10, 'rows': 10},
)

df['box_id'] = ''
# Created up front so the column exists even when no box is created at all.
df['StorageLocation'] = None

# Walk the hierarchy on a snapshot so that writing box_id / StorageLocation
# into df cannot interfere with the iteration. groupby(sort=False) visits keys
# in order of first appearance and skips rows whose key is missing.
hierarchy = df[['Building', 'Floor', 'Freezer', 'Shelf', 'Rack', 'Box']].copy()

# Physical Locations
for building, building_df in hierarchy.groupby('Building', sort=False):
    building_id = _create_storage_item(
        'Physical Location', {'name': building, 'description': 'Building'}
    )
    for floor, floor_df in building_df.groupby('Floor', sort=False):
        floor_id = _create_storage_item(
            'Physical Location',
            {'name': floor, 'description': 'Floor', 'locationId': building_id},
        )
        # Labkey Terminology = Freezer. Our Dataset = Freezer / Liquid Nitrogen Storage
        # Every freezer on the floor is created; stopping after the first one
        # would leave the samples of the remaining freezers without a location.
        for freezer, freezer_df in floor_df.groupby('Freezer', sort=False):
            freezer_full = '{}-{}-{}'.format(building, floor, freezer)
            freezer_desc = 'Freezer' if 'Freezer' in freezer else 'Liquid Nitrogen Room'
            freezer_id = _create_storage_item(
                'Freezer',
                {'name': freezer_full, 'description': freezer_desc, 'locationId': floor_id},
            )
            # Labkey Terminology = Shelf. Our Dataset = Shelf / Tank
            for shelf, shelf_df in freezer_df.groupby('Shelf', sort=False):
                shelf_desc = 'Tank' if 'tank' in shelf.lower() else 'Shelf'
                shelf_id = _create_storage_item(
                    'Shelf',
                    {'name': shelf, 'description': shelf_desc, 'locationId': freezer_id},
                )
                # Labkey Terminology = Rack. Our Dataset = Rack / Tower
                for rack, rack_df in shelf_df.groupby('Rack', sort=False):
                    rack_desc = 'Tower' if 'tower' in rack.lower() else 'Rack'
                    rack_id = _create_storage_item(
                        'Rack',
                        {'name': rack, 'description': rack_desc, 'locationId': shelf_id},
                    )
                    for box, box_df in rack_df.groupby('Box', sort=False):
                        box_id = _create_storage_item(
                            'Terminal Storage Location',
                            {'name': box, 'typeId': box_type_id, 'locationId': rack_id},
                        )
                        # Group frames keep the original index, so these
                        # assignments hit exactly the samples stored in this box.
                        df.loc[box_df.index, 'box_id'] = box_id
                        df.loc[box_df.index, 'StorageLocation'] = '{}/{}/{}/{}'.format(
                            freezer_full, shelf, rack, box
                        )
                        print('Created box: {}'.format(box_id))

# %% [markdown]
# ## Create Sample Types and Insert Samples into LabKey

# %%
# Rows without a sample type cannot be assigned to a domain, so NaN is dropped.
sample_types = df['SampleType'].dropna().unique().tolist()
print(sample_types)

# The field list is the same for every sample type, so it is built once.
labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]
labkey_fields += [{'name': col, 'rangeURI': 'string'} for col in columns]

# Loop over each sample type and create the domain
for sample_type in sample_types:
    sample_params = {
        'kind': 'SampleSet',
        'domainDesign': {
            'name': sample_type,
            'fields': labkey_fields,
            "domainKindName": "SampleSet",
        },
        "options": {
            "name": sample_type,
            "nameExpression": "S-${genId}",
            "aliquotNameExpression": "${${AliquotedFrom}-:withCounter}",
            # Maps the 'SourcePatient' column to the 'Patient' DataClass.
            "importAliases": {
                "SourcePatient": {
                    "inputType": "dataInputs/Patient",
                },
            },
        },
    }
    # Create domain using API call
    api.domain.create(sample_params)
    print(f"Domain created for sample type: {sample_type}")


def _is_blank(value):
    """Return True for NaN/None as well as for any other falsy value ('' or 0)."""
    return pd.isna(value) or not value


# Step 1: Prepare Sample Rows, grouped by sample type
storage_keys = ['StorageLocation', 'StorageRow', 'StorageCol']
row_fields = columns + storage_keys + ['SourcePatient']

rows_by_type = {sample_type: [] for sample_type in sample_types}
for _, row in df.iterrows():
    sample_type = row['SampleType']
    if sample_type not in rows_by_type:
        # No domain was created for this row (missing sample type).
        continue

    record = row[row_fields].fillna('').to_dict()
    record['Name'] = row['SampleIdentifier']
    # 'SampleType' selects the target domain; it is not sent as a value.
    record.pop('SampleType', None)

    # A storage position is only sent when all three parts are present.
    # NaN must be tested explicitly because `not float('nan')` is False.
    if any(_is_blank(row[key]) for key in storage_keys):
        for key in storage_keys:
            del record[key]

    rows_by_type[sample_type].append(record)


# Step 2: Insert Rows
for sample_type, rows_to_insert in rows_by_type.items():
    if rows_to_insert:
        api.query.insert_rows("samples", sample_type, rows_to_insert)
        print(f"Inserted {len(rows_to_insert)} rows into {sample_type} domain.")

# %% [markdown]
# ## Optional Cleanup (Commented)

# %%
# Delete all sample types and their data
#
# for sample_type in sample_types:
#     drop_response = api.domain.drop("samples", sample_type)
#     if "success" in drop_response:
#         print("The dataset domain was deleted.")