Skip to content
Snippets Groups Projects
Commit fa1f440d authored by Ankit Izardar's avatar Ankit Izardar
Browse files

added info about box sizes

parent a33837f3
No related branches found
No related tags found
No related merge requests found
......@@ -23,7 +23,7 @@ The Python script automates the creation, organization, and population of data i
### 4. 🧪 Samples and Storage Hierarchy
- **Reads and processes** sample data from `Samples.xlsx`.
- **Parses** complex hierarchical storage location information from the data (Building → Floor → Freezer → Shelf → Rack → Box → Coordinates).
- **Parses** complex hierarchical storage location information from the data (Building → Floor → Freezer → Shelf → Rack → Box → Coordinates). Note: By default, boxes of size 10x10 are created. Always confirm the box size with the user.
- **Programmatically creates** a structured hierarchy of physical storage locations within LabKey, mirroring the data’s structure.
- **Associates** each sample with the correct storage coordinates within LabKey.
......
%% Cell type:markdown id:f154826c tags:
## Import Libraries
%% Cell type:code id:e650569d tags:
``` python
import labkey
from labkey.api_wrapper import APIWrapper
import pandas as pd
import json
import urllib3
import urllib
import os
```
%% Cell type:markdown id:df672b72 tags:
## Project Configuration and LabKey API Initialization
%% Cell type:code id:2f599db6 tags:
``` python
# Define project and LabKey server details
# PROJECT is the container path; it is also used later to build request URLs
# and in the "Project not found" error message.
PROJECT = 'sciCORE-dev/Ankit/Sample_Manager_Test_Dataset'
# Host name only (no scheme); SSL is switched on via use_ssl below.
LABKEY_SERVER = "labkey-pro-dev.scicore.unibas.ch"
CONTEXT_PATH = '' # Use 'labkey' for main server
# Initialize LabKey API Wrapper
# `api` is the single entry point used by the rest of the notebook
# (api.domain, api.query, api.storage, api.server_context).
api = APIWrapper(LABKEY_SERVER, PROJECT, CONTEXT_PATH, use_ssl=True)
```
%% Cell type:markdown id:3def1a57 tags:
## Authentication Setup
%% Cell type:code id:78ec9c0a tags:
``` python
import netrc

# Path to .netrc file for authentication
NETRC_FILE = os.path.join(os.path.expanduser('~'), '.netrc')

# Verify and read .netrc file
if not os.path.isfile(NETRC_FILE):
    raise FileNotFoundError(f'.netrc file not found: {NETRC_FILE}')

# Extract login credentials for the configured LabKey host.
# The standard-library parser handles both the one-line and the multi-line
# .netrc layouts and selects the entry by machine name, so credentials that
# belong to another host are never sent to this server.
credentials = netrc.netrc(NETRC_FILE).authenticators(LABKEY_SERVER)
if credentials is None:
    raise LookupError(
        f'No entry for machine {LABKEY_SERVER} found in {NETRC_FILE}'
    )
login, _account, password = credentials

# Authentication headers (HTTP basic auth) reused by the raw requests below
headers = urllib3.util.make_headers(basic_auth=f'{login}:{password}')
```
%% Cell type:markdown id:f5f58e20 tags:
## Verify Project Directory
%% Cell type:code id:cef2e4e8 tags:
``` python
# Request the container listing of the configured project; a 404 response is
# reported as "project not found".
params = {"includeSubfolders": True, "depth": 1}

# Spaces in the project path are percent-encoded before building the URL.
encoded_project = PROJECT.replace(' ', '%20')
url = api.server_context.build_url(
    "project",
    "getContainers.view",
    container_path=encoded_project,
)

# The parameters are sent as a URL-encoded request body.
payload = urllib.parse.urlencode(params).encode()
resp = api.server_context.make_request(
    url,
    payload,
    headers=headers,
    non_json_response=True,
)

project_missing = resp.status_code == 404
if project_missing:
    raise Exception(f'Project not found: {PROJECT}. Please create it first.')
```
%% Cell type:markdown id:9203b282 tags:
## Create and Populate Source Type 'Study'
%% Cell type:code id:0d422897 tags:
``` python
# Define the source Excel file for study data
SOURCE_STUDY = 'Study.xlsx'

# Read data from the Excel file
try:
    df = pd.read_excel(SOURCE_STUDY)
except Exception as e:
    print(f'Error reading Excel file {SOURCE_STUDY}: {e}')
    # Raise SystemExit directly: the exit() helper is injected for interactive
    # sessions and is not guaranteed to exist or behave the same everywhere.
    raise SystemExit(1) from e

# Extract column names: every column except the first one. The 'SourceID'
# column is sent as the row's 'Name' below.
# NOTE(review): this assumes 'SourceID' is the first column - confirm.
columns = df.columns[1:].tolist()

# Define LabKey fields for the DataClass: one string field per data column.
# NOTE(review): no explicit 'Name' field is declared although rows carry a
# 'Name' key - presumably LabKey supplies it for DataClasses; confirm.
labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]

# Define DataClass domain
study_domain_definition = {
    "kind": "DataClass",
    "domainDesign": {
        "name": "Study",
        "fields": labkey_fields,
    },
    "options": {
        "category": "sources",
    },
}

# Create the DataClass domain in LabKey
try:
    created_dataclass_domain = api.domain.create(study_domain_definition)
    print("Success: Domain created for sample source: Study")
except Exception as e:
    print(f'Error creating domain: {e}')
    raise SystemExit(1) from e

# Build one record per spreadsheet row: data columns with missing values
# replaced by '', plus 'Name' taken from the 'SourceID' column.
sources_rows = (
    df[columns]
    .fillna('')
    .assign(Name=df['SourceID'])
    .to_dict('records')
)

# Insert data into the DataClass 'Study'
try:
    insert_result = api.query.insert_rows("exp.data", "Study", sources_rows)
    print("Success: Data inserted into the DataClass: Study")
except Exception as e:
    print(f'Error inserting data: {e}')
    raise SystemExit(1) from e
```
%% Output
/Users/izarda0000/miniconda3/lib/python3.12/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
Success: Domain created for sample source: Study
Success: Data inserted into the DataClass: Study
%% Cell type:markdown id:cee50632 tags:
## Create and Populate Source Type 'Patient' (Linked to Study)
%% Cell type:code id:dfb86ebf tags:
``` python
# Define the source Excel file for patient data
SOURCE_PATIENT = 'Patient.xlsx'

# Read data from the Excel file
try:
    df = pd.read_excel(SOURCE_PATIENT)
except Exception as e:
    print(f'Error reading Excel file {SOURCE_PATIENT}: {e}')
    # Raise SystemExit directly: the exit() helper is injected for interactive
    # sessions and is not guaranteed to exist or behave the same everywhere.
    raise SystemExit(1) from e

# Extract column names except for the first and the last column.
# NOTE(review): this assumes 'SourceID' is the first and 'SourceStudy' the
# last column of the sheet - confirm against the Excel file.
columns = df.columns.tolist()[1:-1]

# Define LabKey fields for the DataClass: one string field per data column.
# NOTE(review): no explicit 'Name' field is declared although rows carry a
# 'Name' key - presumably LabKey supplies it for DataClasses; confirm.
labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]

# Define DataClass domain. The import alias routes the 'SourceStudy' column
# to the 'Study' DataClass as a data input (lineage).
patient_domain_definition = {
    "kind": "DataClass",
    "domainDesign": {
        "name": "Patient",
        "fields": labkey_fields,
    },
    "options": {
        "category": "sources",
        "name": "Patient",
        "importAliases": {
            "SourceStudy": {
                "inputType": "dataInputs/Study",
            },
        },
    },
}

# Create the DataClass domain in LabKey
try:
    created_dataclass_domain = api.domain.create(patient_domain_definition)
    print("Success: Domain created for sample source: Patient")
except Exception as e:
    print(f'Error creating domain: {e}')
    raise SystemExit(1) from e

# Build one record per spreadsheet row: data columns plus 'SourceStudy' (for
# lineage) with missing values replaced by '', and 'Name' taken from the
# 'SourceID' column.
sources_rows = (
    df[columns + ['SourceStudy']]
    .fillna('')
    .assign(Name=df['SourceID'])
    .to_dict('records')
)

# Insert data into the DataClass 'Patient'
try:
    insert_result = api.query.insert_rows("exp.data", "Patient", sources_rows)
    print("Success: Data inserted into the DataClass: Patient")
except Exception as e:
    print(f'Error inserting data: {e}')
    raise SystemExit(1) from e
```
%% Output
Success: Domain created for sample source: Patient
Success: Data inserted into the DataClass: Patient
%% Cell type:markdown id:36d27f01 tags:
## Process Samples and Create Storage Hierarchy
### By default, boxes of size 10x10 are created! Please check the box size with the user and correct it in the UI after the boxes are created and before populating them with samples!
%% Cell type:code id:8a5bf323 tags:
``` python
SOURCE_SAMPLES = 'Samples.xlsx'

# Read data from the Excel file
try:
    df = pd.read_excel(SOURCE_SAMPLES)
except Exception as e:
    print(f'Error reading Excel file {SOURCE_SAMPLES}: {e}')
    # Raise SystemExit directly: the exit() helper is injected for interactive
    # sessions and is not guaranteed to exist or behave the same everywhere.
    raise SystemExit(1) from e

# Sample ID is a reserved field in LabKey, so we need to rename it to SampleIdentifier.
df.rename(columns={'Sample ID': 'SampleIdentifier'}, inplace=True)

# Columns for the samples table: everything except the first and the last
# column of the sheet.
# NOTE(review): this assumes the sample identifier is the first column (it is
# sent as "Name" later) and 'SourcePatient' the last one (only used for
# mapping to the Patient DataClass) - confirm against the Excel file.
columns = df.columns[1:-1].tolist()

# ### Get Building, Floor, Freezer, Shelf, Rack, Box, and Coordinates from Location
# Note: Change the hierarchy according to the data in the Excel file.
# 'Location' is read as Building/Floor/Freezer/Shelf/Rack/Box:Row/Col.
location_parts = df['Location'].str.split('/')
df['Building'] = location_parts.str[0]
df['Floor'] = location_parts.str[1]
df['Freezer'] = location_parts.str[2]
df['Freezer full'] = df['Building'] + '/' + df['Floor'] + '/' + df['Freezer']
df['Shelf'] = location_parts.str[3]
df['Rack'] = location_parts.str[4]

# The sixth and seventh path parts together hold "Box:Row/Col"; the text
# before the colon is the box name, the text after it the coordinates.
box_and_coordinates = (location_parts.str[5] + '/' + location_parts.str[6]).str.split(':')
df['Box'] = box_and_coordinates.str[0]
df['Coordinates'] = box_and_coordinates.str[-1]

# Row/column stay None for samples without coordinates; the column part must
# be an integer, the row part is kept as text.
df['StorageCol'] = None
df['StorageRow'] = None
has_coordinates = df['Coordinates'].notna()
coordinate_parts = df.loc[has_coordinates, 'Coordinates'].str.split('/')
df.loc[has_coordinates, 'StorageCol'] = coordinate_parts.str[-1].astype(int)
df.loc[has_coordinates, 'StorageRow'] = coordinate_parts.str[0]


def _create_storage_item(item_type, props):
    """Create one storage item through the LabKey storage API and return its rowId."""
    return api.storage.create_storage_item(item_type, props)['data']['rowId']


# ### Create storage hierarchy in LabKey
# Create Unit Type Box. An existing 'Box 10x10' unit type is deleted first;
# this is best effort, so a failure is reported but does not stop the run.
try:
    api.storage.delete_storage_item('Storage Unit Type', {'name': 'Box 10x10'})
except Exception as e:
    print(f'Note: unit type Box 10x10 was not deleted: {e}')

box_type_id = _create_storage_item(
    'Storage Unit Type',
    {'name': 'Box 10x10', 'description': 'Box 10x10', 'UnitType': 'Box', 'cols': 10, 'rows': 10},
)

# Initialise both result columns up front so they exist even when no box is
# created (the sample-insert step reads 'StorageLocation' for every row).
df['box_id'] = ''
df['StorageLocation'] = None

# Group on a snapshot of the hierarchy columns so that writing results back
# into df cannot interfere with the iteration. groupby skips missing keys and,
# with sort=False, yields groups in order of first appearance.
hierarchy = df[['Building', 'Floor', 'Freezer', 'Shelf', 'Rack', 'Box']].copy()

# Physical Locations
for building, building_df in hierarchy.groupby('Building', sort=False):
    building_id = _create_storage_item(
        'Physical Location',
        {'name': building, 'description': 'Building'},
    )
    for floor, floor_df in building_df.groupby('Floor', sort=False):
        floor_id = _create_storage_item(
            'Physical Location',
            {'name': floor, 'description': 'Floor', 'locationId': building_id},
        )
        # LabKey terminology = Freezer. Our dataset = Freezer / Liquid Nitrogen Storage
        # NOTE: every freezer on the floor is created; stopping after the first
        # one would leave samples in the other freezers without a location.
        for freezer, freezer_df in floor_df.groupby('Freezer', sort=False):
            freezer_full = f'{building}-{floor}-{freezer}'
            freezer_desc = 'Freezer' if 'Freezer' in freezer else 'Liquid Nitrogen Room'
            freezer_id = _create_storage_item(
                'Freezer',
                {'name': freezer_full, 'description': freezer_desc, 'locationId': floor_id},
            )
            # LabKey terminology = Shelf. Our dataset = Shelf / Tank
            for shelf, shelf_df in freezer_df.groupby('Shelf', sort=False):
                shelf_desc = 'Tank' if 'tank' in shelf.lower() else 'Shelf'
                shelf_id = _create_storage_item(
                    'Shelf',
                    {'name': shelf, 'description': shelf_desc, 'locationId': freezer_id},
                )
                # LabKey terminology = Rack. Our dataset = Rack / Tower
                for rack, rack_df in shelf_df.groupby('Rack', sort=False):
                    rack_desc = 'Tower' if 'tower' in rack.lower() else 'Rack'
                    rack_id = _create_storage_item(
                        'Rack',
                        {'name': rack, 'description': rack_desc, 'locationId': shelf_id},
                    )
                    for box, box_df in rack_df.groupby('Box', sort=False):
                        box_id = _create_storage_item(
                            'Terminal Storage Location',
                            {'name': box, 'typeId': box_type_id, 'locationId': rack_id},
                        )
                        # Record the box id and the full storage path on
                        # every sample stored in this box.
                        df.loc[box_df.index, 'box_id'] = box_id
                        df.loc[box_df.index, 'StorageLocation'] = f'{freezer_full}/{shelf}/{rack}/{box}'
                        print(f'Created box: {box_id}')
```
%% Output
Created box: 6522
Created box: 6523
%% Cell type:markdown id:530f62a9 tags:
## Create Sample Types and Insert Samples into LabKey
%% Cell type:code id:105001ab tags:
``` python
# Distinct sample types present in the data; rows without a sample type are
# ignored so that a missing value never becomes a domain name.
sample_types = df['SampleType'].dropna().unique().tolist()
print(sample_types)

# Loop over each sample type and create the domain
for sample_type in sample_types:
    # 'Name' plus one string field per data column.
    labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]
    labkey_fields.extend({'name': col, 'rangeURI': 'string'} for col in columns)

    sample_params = {
        'kind': 'SampleSet',
        'domainDesign': {
            'name': sample_type,
            'fields': labkey_fields,
            "domainKindName": "SampleSet",
        },
        "options": {
            "name": sample_type,
            "nameExpression": "S-${genId}",
            "aliquotNameExpression": "${${AliquotedFrom}-:withCounter}",
            # Routes the 'SourcePatient' column to the 'Patient' DataClass
            # as a data input (lineage).
            "importAliases": {
                "SourcePatient": {
                    "inputType": "dataInputs/Patient",
                },
            },
        },
    }

    # Create domain using API call
    sample_domain = api.domain.create(sample_params)
    print(f"Domain created for sample type: {sample_type}")

# Step 1: Prepare Sample Rows
# One record per sample: data columns, storage fields and 'SourcePatient' with
# missing values replaced by '', plus 'Name' taken from 'SampleIdentifier'.
storage_fields = ['StorageLocation', 'StorageRow', 'StorageCol']
sample_rows = (
    df[columns + storage_fields + ['SourcePatient']]
    .fillna('')
    .assign(Name=df['SampleIdentifier'])
    .to_dict('records')
)

# A sample is only placed in storage when location, row and column are all
# known. The check runs on the filled values: a missing value is '' here,
# whereas the raw NaN would count as "present" in a truth test.
for sample_row in sample_rows:
    if not all(sample_row[field] for field in storage_fields):
        for field in storage_fields:
            del sample_row[field]

# Step 2: Insert Rows
for sample_type in sample_types:
    # Keep the rows of the current sample type and drop the 'SampleType'
    # column from each of them before inserting.
    rows_to_insert = [
        {key: value for key, value in sample_row.items() if key != "SampleType"}
        for sample_row in sample_rows
        if sample_row.get("SampleType") == sample_type
    ]
    # Insert rows using the API
    if rows_to_insert:
        api.query.insert_rows("samples", sample_type, rows_to_insert)
        print(f"Inserted {len(rows_to_insert)} rows into {sample_type} domain.")
```
%% Output
['Blood', 'Saliva']
Domain created for sample type: Blood
Domain created for sample type: Saliva
Inserted 40 rows into Blood domain.
Inserted 15 rows into Saliva domain.
%% Cell type:markdown id:b525f127 tags:
## Optional Cleanup (Commented)
%% Cell type:code id:a92c91e9 tags:
``` python
# Delete all sample types and their data.
# Kept disabled on purpose; uncomment the lines below to run the cleanup.
#
# for sample_type in sample_types:
#     drop_response = api.domain.drop("samples", sample_type)
#     if "success" in drop_response:
#         print("The dataset domain was deleted.")
```
%% Output
'\nfor sample_type in sample_types:\n\n drop_response = api.domain.drop("samples", sample_type)\n if "success" in drop_response:\n print("The dataset domain was deleted.")\'\n'
......
File deleted
......@@ -190,6 +190,7 @@ except Exception as e:
# %% [markdown]
# ## Process Samples and Create Storage Hierarchy
# ### By default, boxes of size 10x10 are created! Please check the box size with the user and correct it in the UI after the boxes are created and before populating them with samples!
# %%
SOURCE_SAMPLES = 'Samples.xlsx'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment