From 2162543b504ab3034eef7efcf078c6f8c446e152 Mon Sep 17 00:00:00 2001
From: Ankit Izardar <ankit.izardar@unibas.ch>
Date: Wed, 4 Jun 2025 13:00:19 +0000
Subject: [PATCH] add code for checking duplicates and a function to create a
 box of given size

---
 Sample_Manager/Script.ipynb |  43 ++++-
 Sample_Manager/Script.py    | 376 ------------------------------------
 2 files changed, 36 insertions(+), 383 deletions(-)
 delete mode 100644 Sample_Manager/Script.py

diff --git a/Sample_Manager/Script.ipynb b/Sample_Manager/Script.ipynb
index 1f6ad34..c90fece 100644
--- a/Sample_Manager/Script.ipynb
+++ b/Sample_Manager/Script.ipynb
@@ -293,6 +293,32 @@
     "### By default: Box sizes of 10x10 are created! Please make sure to check the Box size with the user and correct them in the UI after boxes are created before populating with Samples!!!"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "e87c18f0",
+   "metadata": {},
+   "source": [
+    "### First check for duplicate locations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d9568dd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Collect every row whose 'Location' value occurs more than once (keep=False marks all occurrences, not just the repeats)\n",
+    "\n",
+    "duplicates = df[df.duplicated(['Location'], keep=False)]\n",
+    "\n",
+    "# Print the duplicate locations\n",
+    "if not duplicates.empty:\n",
+    "    print(\"Duplicates found in 'Location' column:\")\n",
+    "    for index, row in duplicates.iterrows():\n",
+    "        print(f\"Row {index}: {row['Location']}\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 7,
@@ -353,13 +379,16 @@
     "# ### Create storage heierarchy in LabKey\n",
     "\n",
     "# Create Unit Type Box\n",
-    "try:\n",
-    "    api.storage.delete_storage_item('Storage Unit Type', {'name': 'Box 10x10'})\n",
-    "except:\n",
-    "    pass\n",
     "\n",
-    "result = api.storage.create_storage_item('Storage Unit Type', {'name': 'Box 10x10', 'description': 'Box 10x10', 'UnitType': 'Box', 'cols': 10, 'rows': 10})\n",
-    "box_type_id = result['data']['rowId']\n",
+    "def create_storage_unit_type(api, unit_type_name, description, cols, rows):\n",
+    "    try:\n",
+    "        result = api.storage.create_storage_item('Storage Unit Type', {'name': unit_type_name, 'description': description,'UnitType': 'Box', 'cols': cols, 'rows': rows})\n",
+    "        return result['data']['rowId']\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error creating storage unit type {unit_type_name}: {e}\")\n",
+    "        return None\n",
+    "\n",
+    "box_type_id_10_10 = create_storage_unit_type(api, 'Box 10x10', '10x10 Box for storing samples', 10, 10)\n",
     "\n",
     "df['box_id'] = ''\n",
     "# Physical Locations\n",
@@ -398,7 +427,7 @@
     "                    rack_id = result_rack['data']['rowId']\n",
     "                    rack_df = shelf_df.loc[shelf_df['Rack'] == rack]\n",
     "                    for box in rack_df['Box'].dropna().unique():\n",
-    "                        result_box = api.storage.create_storage_item('Terminal Storage Location', {'name': box, 'typeId': box_type_id, 'locationId': rack_id})\n",
+    "                        result_box = api.storage.create_storage_item('Terminal Storage Location', {'name': box, 'typeId': box_type_id_10_10, 'locationId': rack_id})\n",
     "                        box_id = result_box['data']['rowId']\n",
     "                        box_df = rack_df.loc[rack_df['Box'] == box]\n",
     "                        df.loc[box_df.index, 'box_id'] = box_id\n",
diff --git a/Sample_Manager/Script.py b/Sample_Manager/Script.py
deleted file mode 100644
index 7e80f7c..0000000
--- a/Sample_Manager/Script.py
+++ /dev/null
@@ -1,376 +0,0 @@
-# %% [markdown]
-# ## Import Libraries
-
-# %%
-
-import labkey
-from labkey.api_wrapper import APIWrapper
-import pandas as pd
-import json
-import urllib3
-import urllib
-import os
-
-
-# %% [markdown]
-# ## Project Configuration and LabKey API Initialization
-
-# %%
-
-# Define project and LabKey server details
-PROJECT = 'sciCORE-dev/Ankit/Sample_Manager_Test_Dataset'
-LABKEY_SERVER = "labkey-pro-dev.scicore.unibas.ch"
-CONTEXT_PATH = '' # Use 'labkey' for main server
-
-# Initialize LabKey API Wrapper
-api = APIWrapper(LABKEY_SERVER, PROJECT, CONTEXT_PATH, use_ssl=True)
-
-
-# %% [markdown]
-# ## Authentication Setup
-
-# %%
-
-# Path to .netrc file for authentication
-NETRC_FILE = os.path.join(os.path.expanduser('~'), '.netrc')
-
-# Verify and read .netrc file
-if not os.path.isfile(NETRC_FILE):
-    raise FileNotFoundError(f'.netrc file not found: {NETRC_FILE}')
-
-# Extract login credentials
-netrc_df = pd.read_csv(NETRC_FILE, sep=' ', header=None, names=['key', 'value'])
-login = netrc_df.loc[netrc_df['key'] == 'login', 'value'].iloc[0]
-password = netrc_df.loc[netrc_df['key'] == 'password', 'value'].iloc[0]
-
-# Authentication headers
-headers = urllib3.util.make_headers(basic_auth=f'{login}:{password}')
-
-
-# %% [markdown]
-# ## Verify Project Directory
-
-# %%
-
-params = {"includeSubfolders": True, "depth": 1}
-url = api.server_context.build_url("project", "getContainers.view", container_path=PROJECT.replace(' ', '%20'))
-
-resp = api.server_context.make_request(url, urllib.parse.urlencode(params).encode(), headers=headers, non_json_response=True)
-if resp.status_code == 404:
-    raise Exception(f'Project not found: {PROJECT}. Please create it first.')
-
-
-# %% [markdown]
-# ## Create and Populate Source Type 'Study'
-
-# %%
-# Define the source Excel file for study data
-SOURCE_STUDY = 'Study.xlsx'
-
-# Read data from the Excel file
-try:
-    df = pd.read_excel(SOURCE_STUDY)
-except Exception as e:
-    print(f'Error reading Excel file {SOURCE_STUDY}: {e}')
-    exit(1)
-
-# Extract column names
-columns = df.columns[1:].tolist()
-
-# Define LabKey fields for the DataClass
-labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]
-labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]
-
-# Define DataClass domain
-study_domain_definition = {
-    "kind": "DataClass",
-    "domainDesign": {
-        "name": "Study",
-        "fields": labkey_fields
-    },
-    "options": {
-        "category": "sources"
-    }
-}
-
-# Create the DataClass domain in LabKey
-try:
-    created_dataclass_domain = api.domain.create(study_domain_definition)
-    print("Success: Domain created for sample source: Study")
-except Exception as e:
-    print(f'Error creating domain: {e}')
-    exit(1)
-
-# Insert data into the DataClass 'Study'
-
-sources_rows = []
-# Add Samples of the Sample Type
-for i, row in df.iterrows():
-    sources_rows.append(row[columns].fillna('').to_dict())
-    sources_rows[-1]['Name'] = row['SourceID']
-
-
-# Insert data into the DataClass 'Study'
-try:
-    insert_result = api.query.insert_rows("exp.data", "Study", sources_rows)
-    print("Success: Data inserted into the DataClass: Study")
-except Exception as e:
-    print(f'Error inserting data: {e}')
-    exit(1)
-
-
-# %% [markdown]
-# ## Create and Populate Source Type 'Patient' (Linked to Study)
-
-# %%
-# Define the source Excel file for study data
-SOURCE_PATIENT = 'Patient.xlsx'
-
-# Read data from the Excel file
-try:
-    df = pd.read_excel(SOURCE_PATIENT)
-except Exception as e:
-    print(f'Error reading Excel file {SOURCE_PATIENT}: {e}')
-    exit(1)
-
-# Extract column names except for the last column SourceStudy and SourceID
-columns = df.columns.tolist()[1:-1]
-
-# Define LabKey fields for the DataClass
-labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]
-labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]
-
-# Define DataClass domain
-patient_domain_definition = {
-    "kind": "DataClass",
-    "domainDesign": {
-        "name": "Patient",
-        "fields": labkey_fields
-    },
-    "options": {
-        "category": "sources",
-        "name": "Patient",
-        "importAliases": {
-
-            "SourceStudy": {
-
-                "inputType": "dataInputs/Study"
-
-            }
-
-        }
-    }
-}
-
-# Create the DataClass domain in LabKey
-try:
-    created_dataclass_domain = api.domain.create(patient_domain_definition)
-    print("Success: Domain created for sample source: Patient")
-except Exception as e:
-    print(f'Error creating domain: {e}')
-    exit(1)
-
-# Insert data into the DataClass 'Patient'
-
-sources_rows = []
-
-# Add Source of the Source Type 'Patient'. Also now include 'SourceStudy' column to add lineage 
-for i, row in df.iterrows():
-    sources_rows.append(row[columns + ['SourceStudy']].fillna('').to_dict())
-    sources_rows[-1]['Name'] = row['SourceID']
-
-
-# Insert data into the DataClass 'Study'
-try:
-    insert_result = api.query.insert_rows("exp.data", "Patient", sources_rows)
-    print("Success: Data inserted into the DataClass: Patient")
-except Exception as e:
-    print(f'Error inserting data: {e}')
-    exit(1)
-
-# %% [markdown]
-# ## Process Samples and Create Storage Hierarchy
-### By default: Box sizes of 10x10 are created! Please make sure to check the Box size with the user and correct them in the UI after boxes are created before populating with Samples!!!
-
-# %%
-SOURCE_SAMPLES = 'Samples.xlsx'
-
-# Read data from the Excel file
-
-try:
-    df = pd.read_excel(SOURCE_SAMPLES)
-except Exception as e:
-    print(f'Error reading Excel file {SOURCE_SAMPLES}: {e}')
-    exit(1)
-
-# Sample ID is a reserved field in LabKey, so we need to rename it to SampleIdentifier. 
-df.rename(columns={'Sample ID': 'SampleIdentifier'}, inplace=True)
-
-# Get columns for samples table but do not include the 'SourcePatient'. It is only used for mapping to the Patient DataClass
-# Also exclude Sample ID column. It will be renamed to "Name" column. 
-
-columns = df.columns[1:-1].tolist()
-columns
-
-# ### Get Building, Floor, Freezer, Shelf, Rack, Box, and Coordinates from Location 
-# Note: Change the heirarchy according to the data in excel file
-
-df['Building'] = df['Location'].str.split('/').str[0]
-df['Floor'] = df['Location'].str.split('/').str[1]
-df['Freezer'] = df['Location'].str.split('/').str[2]
-df['Freezer full'] = df['Building'] + '/' + df['Floor'] + '/' + df['Freezer']
-df['Shelf'] = df['Location'].str.split('/').str[3]
-df['Rack'] = df['Location'].str.split('/').str[4]
-df['Box'] = df['Location'].str.split('/').str[5] + '/' + df['Location'].str.split('/').str[6]
-df['Coordinates'] = df['Box'].str.split(':').str[-1]
-df['Box'] = df['Box'].str.split(':').str[0]
-
-# Convert the 'Coordinates' column to numeric, invalid parsing will be set as NaN
-df['StorageCol'] = pd.to_numeric(df['Coordinates'].str.split('/').str[-1])
-
-df['StorageCol'] = None
-df['StorageRow'] = None
-df.loc[~df['Coordinates'].isna(), 'StorageCol'] = df.loc[~df['Coordinates'].isna(), 'Coordinates'].str.split('/').str[-1].astype(int)
-df.loc[~df['Coordinates'].isna(), 'StorageRow'] = df.loc[~df['Coordinates'].isna(), 'Coordinates'].str.split('/').str[0]
-
-
-# ### Create storage heierarchy in LabKey
-
-# Create Unit Type Box
-try:
-    api.storage.delete_storage_item('Storage Unit Type', {'name': 'Box 10x10'})
-except:
-    pass
-
-result = api.storage.create_storage_item('Storage Unit Type', {'name': 'Box 10x10', 'description': 'Box 10x10', 'UnitType': 'Box', 'cols': 10, 'rows': 10})
-box_type_id = result['data']['rowId']
-
-df['box_id'] = ''
-# Physical Locations
-for building in df['Building'].dropna().unique():
-    # 'Physical Location' -> 
-    result_building = api.storage.create_storage_item("Physical Location", {"name": building, "description": "Building"})
-    building_id = result_building['data']['rowId']
-    building_df = df.loc[df['Building'] == building]
-    for floor in building_df['Floor'].dropna().unique():
-        result_floor = api.storage.create_storage_item('Physical Location', {'name': floor, 'description': 'Floor', 'locationId': building_id})
-        floor_id = result_floor['data']['rowId']
-        floor_df = building_df.loc[building_df['Floor'] == floor]
-        # Labkey Terminology = Freezer. Our Dataset = Freezer / Liquid Nitrogen Storage
-        for freezer in floor_df['Freezer'].dropna().unique()[:1]:
-            freezer_full = '{}-{}-{}'.format(building, floor, freezer)
-            freezer_desc = 'Liquid Nitrogen Room'
-            if 'Freezer' in freezer:
-                freezer_desc = 'Freezer'
-            result_freezer = api.storage.create_storage_item('Freezer', {'name': freezer_full, 'description': freezer_desc, 'locationId': floor_id})
-            freezer_id = result_freezer['data']['rowId']
-            freezer_df = floor_df.loc[floor_df['Freezer'] == freezer]
-            # Labkey Terminologoy = Shelf. Our Dataset = Shelf / Tank
-            for shelf in freezer_df['Shelf'].dropna().unique():
-                shelf_desc = 'Shelf'
-                if 'tank' in shelf.lower():
-                    shelf_desc = 'Tank'
-                result_shelf = api.storage.create_storage_item('Shelf', {'name': shelf, 'description': shelf_desc, 'locationId': freezer_id})
-                shelf_id = result_shelf['data']['rowId']
-                shelf_df = freezer_df.loc[freezer_df['Shelf'] == shelf]
-                # Labkey Terminology = Rack. Our Dataset = Rack / Tower
-                for rack in shelf_df['Rack'].dropna().unique():
-                    rack_desc = 'Rack'
-                    if 'tower' in rack.lower():
-                        rack_desc = 'Tower'
-                    result_rack = api.storage.create_storage_item('Rack', {'name': rack, 'description': rack_desc, 'locationId': shelf_id})
-                    rack_id = result_rack['data']['rowId']
-                    rack_df = shelf_df.loc[shelf_df['Rack'] == rack]
-                    for box in rack_df['Box'].dropna().unique():
-                        result_box = api.storage.create_storage_item('Terminal Storage Location', {'name': box, 'typeId': box_type_id, 'locationId': rack_id})
-                        box_id = result_box['data']['rowId']
-                        box_df = rack_df.loc[rack_df['Box'] == box]
-                        df.loc[box_df.index, 'box_id'] = box_id
-                        df.loc[box_df.index, 'StorageLocation'] = '{}/{}/{}/{}'.format(freezer_full, shelf, rack, box)
-                        print('Created box: {}'.format(box_id))
-
-# %% [markdown]
-# ## Create Sample Types and Insert Samples into LabKey
-
-# %%
-sample_types = df['SampleType'].unique().tolist()
-print(sample_types)
-
-# Loop over each sample type and create the domain
-for sample_type in sample_types:
-    labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]
-    
-    for col in columns:
-        rangeURI = 'string'
-        labkey_fields.append({'name': col, 'rangeURI': rangeURI})
-    
-    sample_params = {
-        'kind': 'SampleSet',
-        'domainDesign': {
-            'name': sample_type,
-            'fields': labkey_fields,
-            "domainKindName": "SampleSet",
-        },
-        "options": {
-
-        "name": sample_type,
-
-        "nameExpression": "S-${genId}",
-
-        "aliquotNameExpression": "${${AliquotedFrom}-:withCounter}",
-
-        "importAliases": {
-
-            "SourcePatient": {
-
-                "inputType": "dataInputs/Patient"
-
-            }
-        }
-    }
-    }
-    # Create domain using API call
-    sample_domain = api.domain.create(sample_params)
-    print(f"Domain created for sample type: {sample_type}")
-
-# Step 1: Prepare Sample Rows
-sample_rows = []
-# Add Samples of the Sample Type
-for i, row in df.iterrows():
-    sample_rows.append(row[columns + ['StorageLocation', 'StorageRow', 'StorageCol', 'SourcePatient']].fillna('').to_dict())
-    sample_rows[-1]['Name'] = row['SampleIdentifier']
-
-    if not row['StorageLocation'] or not row['StorageRow'] or not row['StorageCol']:
-        del sample_rows[-1]['StorageLocation']
-        del sample_rows[-1]['StorageRow']
-        del sample_rows[-1]['StorageCol']
-
-
-# Step 2: Insert Rows
-for sample_type in sample_types:
-    # Filter rows that match the current sample type
-    filtered_rows = [row for row in sample_rows if row.get("SampleType") == sample_type]
-
-    # Remove 'Type' column from each row before inserting
-    rows_to_insert = [{k: v for k, v in row.items() if k != "SampleType"} for row in filtered_rows]
-
-    # Insert rows using the API
-    if rows_to_insert:
-        api.query.insert_rows("samples", sample_type, rows_to_insert)
-        print(f"Inserted {len(rows_to_insert)} rows into {sample_type} domain.")
-
-# %% [markdown]
-# ## Optional Cleanup (Commented)
-
-# %%
-# Delete all sample types and their data
-
-'''
-for sample_type in sample_types:
-
-    drop_response = api.domain.drop("samples", sample_type)
-    if "success" in drop_response:
-        print("The dataset domain was deleted.")'
-'''
-
-
-- 
GitLab