added info about box sizes

fa1f440d · Ankit Izardar · a33837f3 · fa1f440d · fa1f440d · a33837f3
Commit fa1f440d authored 2 months ago by Ankit Izardar
--- a/Sample_Manager/README.md
+++ b/Sample_Manager/README.md
@@ -23,7 +23,7 @@ The Python script automates the creation, organization, and population of data i

 ### 4. 🧪 Samples and Storage Hierarchy
 - **Reads and processes** sample data from `Samples.xlsx`.
- **Parses** complex hierarchical storage location information from the data (Building → Floor → Freezer → Shelf → Rack → Box → Coordinates).
+- **Parses** complex hierarchical storage location information from the data (Building → Floor → Freezer → Shelf → Rack → Box → Coordinates). Note: By default, boxes of size 10x10 are created. Always confirm the box size with the user.
 - **Programmatically creates** a structured hierarchy of physical storage locations within LabKey, mirroring the data’s structure.
 - **Associates** each sample with the correct storage coordinates within LabKey.


--- a/Sample_Manager/Script.ipynb
+++ b/Sample_Manager/Script.ipynb
@@ -288,7 +288,9 @@
   "id": "36d27f01",
   "metadata": {},
   "source": [
-    "## Process Samples and Create Storage Hierarchy"
+    "## Process Samples and Create Storage Hierarchy\n",
+    "\n",
+    "### By default: Box sizes of 10x10 are created! Please make sure to check the Box size with the user and correct them in the UI after boxes are created before populating with Samples!!!"
   ]
  },
  {

 %% Cell type:markdown id:f154826c tags:

 ## Import Libraries

 %% Cell type:code id:e650569d tags:

 ``` python

 import labkey
 from labkey.api_wrapper import APIWrapper
 import pandas as pd
 import json
 import urllib3
 import urllib
 import os
 ```

 %% Cell type:markdown id:df672b72 tags:

 ## Project Configuration and LabKey API Initialization

 %% Cell type:code id:2f599db6 tags:

 ``` python

 # Define project and LabKey server details
 # PROJECT is the container path handed to APIWrapper below; the project
 # existence check later in the notebook reuses it to build its request URL.
 PROJECT = 'sciCORE-dev/Ankit/Sample_Manager_Test_Dataset'
 LABKEY_SERVER = "labkey-pro-dev.scicore.unibas.ch"
 CONTEXT_PATH = '' # Use 'labkey' for main server

 # Initialize LabKey API Wrapper
 # All later cells talk to the server through this `api` object (over SSL).
 api = APIWrapper(LABKEY_SERVER, PROJECT, CONTEXT_PATH, use_ssl=True)
 ```

 %% Cell type:markdown id:3def1a57 tags:

 ## Authentication Setup

 %% Cell type:code id:78ec9c0a tags:

 ``` python

 # Build HTTP basic-auth headers from the credentials in the user's ~/.netrc.
 #
 # The file is parsed with the standard-library `netrc` module rather than as
 # a space-separated table: that handles the usual single-line
 # "machine ... login ... password ..." layout, tabs and quoted tokens, and it
 # selects the entry that belongs to LABKEY_SERVER instead of whichever
 # login/password pair happens to come first in the file.
 import netrc

 # Path to .netrc file for authentication
 NETRC_FILE = os.path.join(os.path.expanduser('~'), '.netrc')

 # Verify that the .netrc file exists before trying to parse it
 if not os.path.isfile(NETRC_FILE):
    raise FileNotFoundError(f'.netrc file not found: {NETRC_FILE}')

 # Look up the credentials for the LabKey server; authenticators() falls back
 # to the file's "default" entry and returns None when neither is present.
 credentials = netrc.netrc(NETRC_FILE).authenticators(LABKEY_SERVER)
 if credentials is None:
    raise ValueError(f'No entry for {LABKEY_SERVER} found in {NETRC_FILE}')
 login, _account, password = credentials

 # Authentication headers
 headers = urllib3.util.make_headers(basic_auth=f'{login}:{password}')
 ```

 %% Cell type:markdown id:f5f58e20 tags:

 ## Verify Project Directory

 %% Cell type:code id:cef2e4e8 tags:

 ``` python

 # Ask the server for the project's containers and stop early when the
 # project does not exist yet.
 url = api.server_context.build_url(
    "project",
    "getContainers.view",
    container_path=PROJECT.replace(' ', '%20'),
 )
 params = {"includeSubfolders": True, "depth": 1}
 payload = urllib.parse.urlencode(params).encode()

 resp = api.server_context.make_request(url, payload, headers=headers, non_json_response=True)
 project_missing = resp.status_code == 404
 if project_missing:
    raise Exception(f'Project not found: {PROJECT}. Please create it first.')
 ```

 %% Cell type:markdown id:9203b282 tags:

 ## Create and Populate Source Type 'Study'

 %% Cell type:code id:0d422897 tags:

 ``` python
 # Create the 'Study' source type (a DataClass in the "sources" category) and
 # fill it with the rows of Study.xlsx.

 # Define the source Excel file for study data
 SOURCE_STUDY = 'Study.xlsx'

 # Read data from the Excel file
 try:
    df = pd.read_excel(SOURCE_STUDY)
 except Exception as e:
    print(f'Error reading Excel file {SOURCE_STUDY}: {e}')
    # `exit()` is an interactive-shell helper; raising SystemExit stops the
    # cell (or script) right here with the same exit status.
    raise SystemExit(1) from e

 # Extract column names, skipping the first column
 # (presumably 'SourceID', which is sent as 'Name' below - confirm against the sheet)
 columns = df.columns[1:].tolist()

 # Define LabKey fields for the DataClass: one string field per data column.
 # 'Name' is not declared here; it is only supplied as a value on each row.
 labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]

 # Define DataClass domain
 study_domain_definition = {
    "kind": "DataClass",
    "domainDesign": {
        "name": "Study",
        "fields": labkey_fields
    },
    "options": {
        "category": "sources"
    }
 }

 # Create the DataClass domain in LabKey
 try:
    created_dataclass_domain = api.domain.create(study_domain_definition)
    print("Success: Domain created for sample source: Study")
 except Exception as e:
    print(f'Error creating domain: {e}')
    raise SystemExit(1) from e

 # Build one row per study: the data columns (missing values sent as empty
 # strings) plus 'Name', taken from the 'SourceID' column.
 sources_rows = []
 for i, row in df.iterrows():
    source_row = row[columns].fillna('').to_dict()
    source_row['Name'] = row['SourceID']
    sources_rows.append(source_row)

 # Insert data into the DataClass 'Study'
 try:
    insert_result = api.query.insert_rows("exp.data", "Study", sources_rows)
    print("Success: Data inserted into the DataClass: Study")
 except Exception as e:
    print(f'Error inserting data: {e}')
    raise SystemExit(1) from e
 ```

 %% Output

    /Users/izarda0000/miniconda3/lib/python3.12/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
      warn("Workbook contains no default style, apply openpyxl's default")

    Success: Domain created for sample source: Study
    Success: Data inserted into the DataClass: Study

 %% Cell type:markdown id:cee50632 tags:

 ## Create and Populate Source Type 'Patient' (Linked to Study)

 %% Cell type:code id:dfb86ebf tags:

 ``` python
 # Create the 'Patient' source type (a DataClass in the "sources" category)
 # and fill it with the rows of Patient.xlsx. The 'SourceStudy' column is
 # registered as an import alias to the 'Study' DataClass so that each patient
 # is linked to its study.

 # Define the source Excel file for patient data
 SOURCE_PATIENT = 'Patient.xlsx'

 # Read data from the Excel file
 try:
    df = pd.read_excel(SOURCE_PATIENT)
 except Exception as e:
    print(f'Error reading Excel file {SOURCE_PATIENT}: {e}')
    # `exit()` is an interactive-shell helper; raising SystemExit stops the
    # cell (or script) right here with the same exit status.
    raise SystemExit(1) from e

 # Extract column names, skipping the first and the last column
 # (presumably 'SourceID' and 'SourceStudy' - confirm against the sheet)
 columns = df.columns.tolist()[1:-1]

 # Define LabKey fields for the DataClass: one string field per data column.
 # 'Name' is not declared here; it is only supplied as a value on each row.
 labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]

 # Define DataClass domain
 patient_domain_definition = {
    "kind": "DataClass",
    "domainDesign": {
        "name": "Patient",
        "fields": labkey_fields
    },
    "options": {
        "category": "sources",
        "name": "Patient",
        "importAliases": {
            "SourceStudy": {
                "inputType": "dataInputs/Study"
            }
        }
    }
 }

 # Create the DataClass domain in LabKey
 try:
    created_dataclass_domain = api.domain.create(patient_domain_definition)
    print("Success: Domain created for sample source: Patient")
 except Exception as e:
    print(f'Error creating domain: {e}')
    raise SystemExit(1) from e

 # Build one row per patient: the data columns plus 'SourceStudy' (missing
 # values sent as empty strings) and 'Name', taken from the 'SourceID' column.
 sources_rows = []
 for i, row in df.iterrows():
    source_row = row[columns + ['SourceStudy']].fillna('').to_dict()
    source_row['Name'] = row['SourceID']
    sources_rows.append(source_row)

 # Insert data into the DataClass 'Patient'
 try:
    insert_result = api.query.insert_rows("exp.data", "Patient", sources_rows)
    print("Success: Data inserted into the DataClass: Patient")
 except Exception as e:
    print(f'Error inserting data: {e}')
    raise SystemExit(1) from e
 ```

 %% Output

    Success: Domain created for sample source: Patient
    Success: Data inserted into the DataClass: Patient

 %% Cell type:markdown id:36d27f01 tags:

 ## Process Samples and Create Storage Hierarchy

+### By default: Box sizes of 10x10 are created! Please make sure to check the Box size with the user and correct them in the UI after boxes are created before populating with Samples!!!
+
 %% Cell type:code id:8a5bf323 tags:

 ``` python
 # Read Samples.xlsx, split each sample's 'Location' into its storage levels
 # and create the matching storage hierarchy in LabKey
 # (Building -> Floor -> Freezer -> Shelf -> Rack -> Box).
 # Every box is created with the 'Box 10x10' unit type defined below.
 SOURCE_SAMPLES = 'Samples.xlsx'

 # Read data from the Excel file
 try:
    df = pd.read_excel(SOURCE_SAMPLES)
 except Exception as e:
    print(f'Error reading Excel file {SOURCE_SAMPLES}: {e}')
    # `exit()` is an interactive-shell helper; raising SystemExit stops the
    # cell (or script) right here with the same exit status.
    raise SystemExit(1) from e

 # Sample ID is a reserved field in LabKey, so we need to rename it to SampleIdentifier.
 df.rename(columns={'Sample ID': 'SampleIdentifier'}, inplace=True)

 # Get columns for samples table but do not include the 'SourcePatient'. It is only used for mapping to the Patient DataClass
 # Also exclude Sample ID column. It will be renamed to "Name" column.
 # This must run before the helper columns below are added to the frame.
 columns = df.columns[1:-1].tolist()

 # ### Get Building, Floor, Freezer, Shelf, Rack, Box, and Coordinates from Location
 # Note: Change the hierarchy according to the data in excel file
 location_parts = df['Location'].str.split('/')

 df['Building'] = location_parts.str[0]
 df['Floor'] = location_parts.str[1]
 df['Freezer'] = location_parts.str[2]
 df['Freezer full'] = df['Building'] + '/' + df['Floor'] + '/' + df['Freezer']
 df['Shelf'] = location_parts.str[3]
 df['Rack'] = location_parts.str[4]
 # Segments 5 and 6 together hold "<box>:<row>/<col>"; split them on ':'.
 df['Box'] = location_parts.str[5] + '/' + location_parts.str[6]
 df['Coordinates'] = df['Box'].str.split(':').str[-1]
 df['Box'] = df['Box'].str.split(':').str[0]

 # Split the coordinates into row and column; samples without coordinates
 # keep None in both fields.
 df['StorageCol'] = None
 df['StorageRow'] = None
 has_coordinates = df['Coordinates'].notna()
 coordinate_parts = df.loc[has_coordinates, 'Coordinates'].str.split('/')
 df.loc[has_coordinates, 'StorageCol'] = coordinate_parts.str[-1].astype(int)
 df.loc[has_coordinates, 'StorageRow'] = coordinate_parts.str[0]


 # ### Create storage hierarchy in LabKey

 # Create Unit Type Box
 # Deleting a previous definition is best effort: it fails harmlessly when
 # the unit type does not exist yet.
 try:
    api.storage.delete_storage_item('Storage Unit Type', {'name': 'Box 10x10'})
 except Exception as e:
    print(f"Note: existing 'Box 10x10' unit type was not deleted: {e}")

 result = api.storage.create_storage_item('Storage Unit Type', {'name': 'Box 10x10', 'description': 'Box 10x10', 'UnitType': 'Box', 'cols': 10, 'rows': 10})
 box_type_id = result['data']['rowId']

 df['box_id'] = ''
 # Create the column up front so it exists even when no box gets created.
 df['StorageLocation'] = None
 # Physical Locations
 for building in df['Building'].dropna().unique():
    # 'Physical Location' ->
    result_building = api.storage.create_storage_item("Physical Location", {"name": building, "description": "Building"})
    building_id = result_building['data']['rowId']
    building_df = df.loc[df['Building'] == building]
    for floor in building_df['Floor'].dropna().unique():
        result_floor = api.storage.create_storage_item('Physical Location', {'name': floor, 'description': 'Floor', 'locationId': building_id})
        floor_id = result_floor['data']['rowId']
        floor_df = building_df.loc[building_df['Floor'] == floor]
        # Labkey Terminology = Freezer. Our Dataset = Freezer / Liquid Nitrogen Storage
        # Every freezer on the floor is created, not only the first one, so
        # that samples stored in the other freezers get a storage location too.
        for freezer in floor_df['Freezer'].dropna().unique():
            freezer_full = f'{building}-{floor}-{freezer}'
            freezer_desc = 'Freezer' if 'Freezer' in freezer else 'Liquid Nitrogen Room'
            result_freezer = api.storage.create_storage_item('Freezer', {'name': freezer_full, 'description': freezer_desc, 'locationId': floor_id})
            freezer_id = result_freezer['data']['rowId']
            freezer_df = floor_df.loc[floor_df['Freezer'] == freezer]
            # Labkey Terminology = Shelf. Our Dataset = Shelf / Tank
            for shelf in freezer_df['Shelf'].dropna().unique():
                shelf_desc = 'Tank' if 'tank' in shelf.lower() else 'Shelf'
                result_shelf = api.storage.create_storage_item('Shelf', {'name': shelf, 'description': shelf_desc, 'locationId': freezer_id})
                shelf_id = result_shelf['data']['rowId']
                shelf_df = freezer_df.loc[freezer_df['Shelf'] == shelf]
                # Labkey Terminology = Rack. Our Dataset = Rack / Tower
                for rack in shelf_df['Rack'].dropna().unique():
                    rack_desc = 'Tower' if 'tower' in rack.lower() else 'Rack'
                    result_rack = api.storage.create_storage_item('Rack', {'name': rack, 'description': rack_desc, 'locationId': shelf_id})
                    rack_id = result_rack['data']['rowId']
                    rack_df = shelf_df.loc[shelf_df['Rack'] == rack]
                    for box in rack_df['Box'].dropna().unique():
                        result_box = api.storage.create_storage_item('Terminal Storage Location', {'name': box, 'typeId': box_type_id, 'locationId': rack_id})
                        box_id = result_box['data']['rowId']
                        box_df = rack_df.loc[rack_df['Box'] == box]
                        # Record on every sample of this box where it is stored.
                        df.loc[box_df.index, 'box_id'] = box_id
                        df.loc[box_df.index, 'StorageLocation'] = f'{freezer_full}/{shelf}/{rack}/{box}'
                        print(f'Created box: {box_id}')
 ```

 %% Output

    Created box: 6522
    Created box: 6523

 %% Cell type:markdown id:530f62a9 tags:

 ## Create Sample Types and Insert Samples into LabKey

 %% Cell type:code id:105001ab tags:

 ``` python
 # Create one LabKey sample type per distinct 'SampleType' value and insert
 # the samples, including their storage position when it is fully known.
 sample_types = df['SampleType'].unique().tolist()
 print(sample_types)

 # Loop over each sample type and create the domain
 for sample_type in sample_types:
    # 'Name' plus one string field per sample column
    labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}] + [
        {'name': col, 'rangeURI': 'string'} for col in columns
    ]

    sample_params = {
        'kind': 'SampleSet',
        'domainDesign': {
            'name': sample_type,
            'fields': labkey_fields,
            "domainKindName": "SampleSet",
        },
        "options": {
            "name": sample_type,
            "nameExpression": "S-${genId}",
            "aliquotNameExpression": "${${AliquotedFrom}-:withCounter}",
            # 'SourcePatient' links each sample to its parent in the
            # 'Patient' DataClass.
            "importAliases": {
                "SourcePatient": {
                    "inputType": "dataInputs/Patient"
                }
            }
        }
    }
    # Create domain using API call
    sample_domain = api.domain.create(sample_params)
    print(f"Domain created for sample type: {sample_type}")

 # Step 1: Prepare Sample Rows
 storage_fields = ['StorageLocation', 'StorageRow', 'StorageCol']
 sample_rows = []
 for i, row in df.iterrows():
    sample_row = row[columns + storage_fields + ['SourcePatient']].fillna('').to_dict()
    sample_row['Name'] = row['SampleIdentifier']

    # Send the storage position only when all three parts are present. The
    # check uses the filled values because a missing value in the raw row can
    # be NaN, which is truthy and would slip through a plain `not` test.
    if not all(sample_row[field] for field in storage_fields):
        for field in storage_fields:
            del sample_row[field]

    sample_rows.append(sample_row)


 # Step 2: Insert Rows
 for sample_type in sample_types:
    # Filter rows that match the current sample type
    filtered_rows = [row for row in sample_rows if row.get("SampleType") == sample_type]

    # Remove 'SampleType' column from each row before inserting
    rows_to_insert = [{k: v for k, v in row.items() if k != "SampleType"} for row in filtered_rows]

    # Insert rows using the API
    if rows_to_insert:
        api.query.insert_rows("samples", sample_type, rows_to_insert)
        print(f"Inserted {len(rows_to_insert)} rows into {sample_type} domain.")
 ```

 %% Output

    ['Blood', 'Saliva']
    Domain created for sample type: Blood
    Domain created for sample type: Saliva
    Inserted 40 rows into Blood domain.
    Inserted 15 rows into Saliva domain.

 %% Cell type:markdown id:b525f127 tags:

 ## Optional Cleanup (Commented)

 %% Cell type:code id:a92c91e9 tags:

 ``` python
 # Delete all sample types and their data
 #
 # The cleanup is kept as real comments so that running this cell does
 # nothing. Uncomment the lines below to drop every sample type created above.
 #
 # for sample_type in sample_types:
 #     drop_response = api.domain.drop("samples", sample_type)
 #     if "success" in drop_response:
 #         print("The dataset domain was deleted.")
 ```

 %% Output

    '\nfor sample_type in sample_types:\n\n    drop_response = api.domain.drop("samples", sample_type)\n    if "success" in drop_response:\n        print("The dataset domain was deleted.")\'\n'

--- a/Sample_Manager/Script.pdf
+++ b/Sample_Manager/Script.pdf
--- a/Sample_Manager/Script.py
+++ b/Sample_Manager/Script.py
@@ -190,6 +190,7 @@ except Exception as e:

 # %% [markdown]
 # ## Process Samples and Create Storage Hierarchy
+# ### By default: Box sizes of 10x10 are created! Please make sure to check the Box size with the user and correct them in the UI after boxes are created before populating with Samples!!!

 # %%
 SOURCE_SAMPLES = 'Samples.xlsx'