Skip to content
Snippets Groups Projects
Commit 2162543b authored by Ankit Izardar's avatar Ankit Izardar
Browse files

add code for checking duplicates and a function to create a box of given size

parent 0b7f4576
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:f154826c tags: %% Cell type:markdown id:f154826c tags:
## Import Libraries ## Import Libraries
%% Cell type:code id:e650569d tags: %% Cell type:code id:e650569d tags:
``` python ``` python
import labkey import labkey
from labkey.api_wrapper import APIWrapper from labkey.api_wrapper import APIWrapper
import pandas as pd import pandas as pd
import json import json
import urllib3 import urllib3
import urllib import urllib
import os import os
``` ```
%% Cell type:markdown id:df672b72 tags: %% Cell type:markdown id:df672b72 tags:
## Project Configuration and LabKey API Initialization ## Project Configuration and LabKey API Initialization
%% Cell type:code id:2f599db6 tags: %% Cell type:code id:2f599db6 tags:
``` python ``` python
# Define project and LabKey server details # Define project and LabKey server details
PROJECT = 'sciCORE-dev/Ankit/Sample_Manager_Test_Dataset' PROJECT = 'sciCORE-dev/Ankit/Sample_Manager_Test_Dataset'
LABKEY_SERVER = "labkey-pro-dev.scicore.unibas.ch" LABKEY_SERVER = "labkey-pro-dev.scicore.unibas.ch"
CONTEXT_PATH = '' # Use 'labkey' for main server CONTEXT_PATH = '' # Use 'labkey' for main server
# Initialize LabKey API Wrapper # Initialize LabKey API Wrapper
api = APIWrapper(LABKEY_SERVER, PROJECT, CONTEXT_PATH, use_ssl=True) api = APIWrapper(LABKEY_SERVER, PROJECT, CONTEXT_PATH, use_ssl=True)
``` ```
%% Cell type:markdown id:3def1a57 tags: %% Cell type:markdown id:3def1a57 tags:
## Authentication Setup ## Authentication Setup
%% Cell type:code id:78ec9c0a tags: %% Cell type:code id:78ec9c0a tags:
``` python ``` python
# Path to .netrc file for authentication # Path to .netrc file for authentication
NETRC_FILE = os.path.join(os.path.expanduser('~'), '.netrc') NETRC_FILE = os.path.join(os.path.expanduser('~'), '.netrc')
# Verify and read .netrc file # Verify and read .netrc file
if not os.path.isfile(NETRC_FILE): if not os.path.isfile(NETRC_FILE):
raise FileNotFoundError(f'.netrc file not found: {NETRC_FILE}') raise FileNotFoundError(f'.netrc file not found: {NETRC_FILE}')
# Extract login credentials # Extract login credentials
netrc_df = pd.read_csv(NETRC_FILE, sep=' ', header=None, names=['key', 'value']) netrc_df = pd.read_csv(NETRC_FILE, sep=' ', header=None, names=['key', 'value'])
login = netrc_df.loc[netrc_df['key'] == 'login', 'value'].iloc[0] login = netrc_df.loc[netrc_df['key'] == 'login', 'value'].iloc[0]
password = netrc_df.loc[netrc_df['key'] == 'password', 'value'].iloc[0] password = netrc_df.loc[netrc_df['key'] == 'password', 'value'].iloc[0]
# Authentication headers # Authentication headers
headers = urllib3.util.make_headers(basic_auth=f'{login}:{password}') headers = urllib3.util.make_headers(basic_auth=f'{login}:{password}')
``` ```
%% Cell type:markdown id:f5f58e20 tags: %% Cell type:markdown id:f5f58e20 tags:
## Verify Project Directory ## Verify Project Directory
%% Cell type:code id:cef2e4e8 tags: %% Cell type:code id:cef2e4e8 tags:
``` python ``` python
params = {"includeSubfolders": True, "depth": 1} params = {"includeSubfolders": True, "depth": 1}
url = api.server_context.build_url("project", "getContainers.view", container_path=PROJECT.replace(' ', '%20')) url = api.server_context.build_url("project", "getContainers.view", container_path=PROJECT.replace(' ', '%20'))
resp = api.server_context.make_request(url, urllib.parse.urlencode(params).encode(), headers=headers, non_json_response=True) resp = api.server_context.make_request(url, urllib.parse.urlencode(params).encode(), headers=headers, non_json_response=True)
if resp.status_code == 404: if resp.status_code == 404:
raise Exception(f'Project not found: {PROJECT}. Please create it first.') raise Exception(f'Project not found: {PROJECT}. Please create it first.')
``` ```
%% Cell type:markdown id:9203b282 tags: %% Cell type:markdown id:9203b282 tags:
## Create and Populate Source Type 'Study' ## Create and Populate Source Type 'Study'
%% Cell type:code id:0d422897 tags: %% Cell type:code id:0d422897 tags:
``` python ``` python
# Define the source Excel file for study data # Define the source Excel file for study data
SOURCE_STUDY = 'Study.xlsx' SOURCE_STUDY = 'Study.xlsx'
# Read data from the Excel file # Read data from the Excel file
try: try:
df = pd.read_excel(SOURCE_STUDY) df = pd.read_excel(SOURCE_STUDY)
except Exception as e: except Exception as e:
print(f'Error reading Excel file {SOURCE_STUDY}: {e}') print(f'Error reading Excel file {SOURCE_STUDY}: {e}')
exit(1) exit(1)
# Extract column names # Extract column names
columns = df.columns[1:].tolist() columns = df.columns[1:].tolist()
# Define LabKey fields for the DataClass # Define LabKey fields for the DataClass
labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}] labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]
labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns] labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]
# Define DataClass domain # Define DataClass domain
study_domain_definition = { study_domain_definition = {
"kind": "DataClass", "kind": "DataClass",
"domainDesign": { "domainDesign": {
"name": "Study", "name": "Study",
"fields": labkey_fields "fields": labkey_fields
}, },
"options": { "options": {
"category": "sources" "category": "sources"
} }
} }
# Create the DataClass domain in LabKey # Create the DataClass domain in LabKey
try: try:
created_dataclass_domain = api.domain.create(study_domain_definition) created_dataclass_domain = api.domain.create(study_domain_definition)
print("Success: Domain created for sample source: Study") print("Success: Domain created for sample source: Study")
except Exception as e: except Exception as e:
print(f'Error creating domain: {e}') print(f'Error creating domain: {e}')
exit(1) exit(1)
# Insert data into the DataClass 'Study' # Insert data into the DataClass 'Study'
sources_rows = [] sources_rows = []
# Add Samples of the Sample Type # Add Samples of the Sample Type
for i, row in df.iterrows(): for i, row in df.iterrows():
sources_rows.append(row[columns].fillna('').to_dict()) sources_rows.append(row[columns].fillna('').to_dict())
sources_rows[-1]['Name'] = row['SourceID'] sources_rows[-1]['Name'] = row['SourceID']
# Insert data into the DataClass 'Study' # Insert data into the DataClass 'Study'
try: try:
insert_result = api.query.insert_rows("exp.data", "Study", sources_rows) insert_result = api.query.insert_rows("exp.data", "Study", sources_rows)
print("Success: Data inserted into the DataClass: Study") print("Success: Data inserted into the DataClass: Study")
except Exception as e: except Exception as e:
print(f'Error inserting data: {e}') print(f'Error inserting data: {e}')
exit(1) exit(1)
``` ```
%% Output %% Output
/Users/izarda0000/miniconda3/lib/python3.12/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default /Users/izarda0000/miniconda3/lib/python3.12/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default") warn("Workbook contains no default style, apply openpyxl's default")
Success: Domain created for sample source: Study Success: Domain created for sample source: Study
Success: Data inserted into the DataClass: Study Success: Data inserted into the DataClass: Study
%% Cell type:markdown id:cee50632 tags: %% Cell type:markdown id:cee50632 tags:
## Create and Populate Source Type 'Patient' (Linked to Study) ## Create and Populate Source Type 'Patient' (Linked to Study)
%% Cell type:code id:dfb86ebf tags: %% Cell type:code id:dfb86ebf tags:
``` python ``` python
# Define the source Excel file for study data # Define the source Excel file for study data
SOURCE_PATIENT = 'Patient.xlsx' SOURCE_PATIENT = 'Patient.xlsx'
# Read data from the Excel file # Read data from the Excel file
try: try:
df = pd.read_excel(SOURCE_PATIENT) df = pd.read_excel(SOURCE_PATIENT)
except Exception as e: except Exception as e:
print(f'Error reading Excel file {SOURCE_PATIENT}: {e}') print(f'Error reading Excel file {SOURCE_PATIENT}: {e}')
exit(1) exit(1)
# Extract column names except for the last column SourceStudy and SourceID # Extract column names except for the last column SourceStudy and SourceID
columns = df.columns.tolist()[1:-1] columns = df.columns.tolist()[1:-1]
# Define LabKey fields for the DataClass # Define LabKey fields for the DataClass
labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}] labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]
labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns] labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]
# Define DataClass domain # Define DataClass domain
patient_domain_definition = { patient_domain_definition = {
"kind": "DataClass", "kind": "DataClass",
"domainDesign": { "domainDesign": {
"name": "Patient", "name": "Patient",
"fields": labkey_fields "fields": labkey_fields
}, },
"options": { "options": {
"category": "sources", "category": "sources",
"name": "Patient", "name": "Patient",
"importAliases": { "importAliases": {
"SourceStudy": { "SourceStudy": {
"inputType": "dataInputs/Study" "inputType": "dataInputs/Study"
} }
} }
} }
} }
# Create the DataClass domain in LabKey # Create the DataClass domain in LabKey
try: try:
created_dataclass_domain = api.domain.create(patient_domain_definition) created_dataclass_domain = api.domain.create(patient_domain_definition)
print("Success: Domain created for sample source: Patient") print("Success: Domain created for sample source: Patient")
except Exception as e: except Exception as e:
print(f'Error creating domain: {e}') print(f'Error creating domain: {e}')
exit(1) exit(1)
# Insert data into the DataClass 'Patient' # Insert data into the DataClass 'Patient'
sources_rows = [] sources_rows = []
# Add Source of the Source Type 'Patient'. Also now include 'SourceStudy' column to add lineage # Add Source of the Source Type 'Patient'. Also now include 'SourceStudy' column to add lineage
for i, row in df.iterrows(): for i, row in df.iterrows():
sources_rows.append(row[columns + ['SourceStudy']].fillna('').to_dict()) sources_rows.append(row[columns + ['SourceStudy']].fillna('').to_dict())
sources_rows[-1]['Name'] = row['SourceID'] sources_rows[-1]['Name'] = row['SourceID']
# Insert data into the DataClass 'Study' # Insert data into the DataClass 'Study'
try: try:
insert_result = api.query.insert_rows("exp.data", "Patient", sources_rows) insert_result = api.query.insert_rows("exp.data", "Patient", sources_rows)
print("Success: Data inserted into the DataClass: Patient") print("Success: Data inserted into the DataClass: Patient")
except Exception as e: except Exception as e:
print(f'Error inserting data: {e}') print(f'Error inserting data: {e}')
exit(1) exit(1)
``` ```
%% Output %% Output
Success: Domain created for sample source: Patient Success: Domain created for sample source: Patient
Success: Data inserted into the DataClass: Patient Success: Data inserted into the DataClass: Patient
%% Cell type:markdown id:36d27f01 tags: %% Cell type:markdown id:36d27f01 tags:
## Process Samples and Create Storage Hierarchy ## Process Samples and Create Storage Hierarchy
### By default: Box sizes of 10x10 are created! Please make sure to check the Box size with the user and correct them in the UI after boxes are created before populating with Samples!!! ### By default: Box sizes of 10x10 are created! Please make sure to check the Box size with the user and correct them in the UI after boxes are created before populating with Samples!!!
%% Cell type:markdown id:e87c18f0 tags:
### First check for duplicate locations
%% Cell type:code id:8d9568dd tags:
``` python
# Get a list of duplicates for Location column in df
duplicates = df[df.duplicated(['Location'], keep=False)]
# print duplicates locations
if not duplicates.empty:
print("Duplicates found in 'Location' column:")
for index, row in duplicates.iterrows():
print(f"Row {index}: {row['Location']}")
```
%% Cell type:code id:8a5bf323 tags: %% Cell type:code id:8a5bf323 tags:
``` python ``` python
SOURCE_SAMPLES = 'Samples.xlsx' SOURCE_SAMPLES = 'Samples.xlsx'
# Read data from the Excel file # Read data from the Excel file
try: try:
df = pd.read_excel(SOURCE_SAMPLES) df = pd.read_excel(SOURCE_SAMPLES)
except Exception as e: except Exception as e:
print(f'Error reading Excel file {SOURCE_SAMPLES}: {e}') print(f'Error reading Excel file {SOURCE_SAMPLES}: {e}')
exit(1) exit(1)
# Sample ID is a reserved field in LabKey, so we need to rename it to SampleIdentifier. # Sample ID is a reserved field in LabKey, so we need to rename it to SampleIdentifier.
df.rename(columns={'Sample ID': 'SampleIdentifier'}, inplace=True) df.rename(columns={'Sample ID': 'SampleIdentifier'}, inplace=True)
# Get columns for samples table but do not include the 'SourcePatient'. It is only used for mapping to the Patient DataClass # Get columns for samples table but do not include the 'SourcePatient'. It is only used for mapping to the Patient DataClass
# Also exclude Sample ID column. It will be renamed to "Name" column. # Also exclude Sample ID column. It will be renamed to "Name" column.
columns = df.columns[1:-1].tolist() columns = df.columns[1:-1].tolist()
columns columns
# ### Get Building, Floor, Freezer, Shelf, Rack, Box, and Coordinates from Location # ### Get Building, Floor, Freezer, Shelf, Rack, Box, and Coordinates from Location
# Note: Change the heirarchy according to the data in excel file # Note: Change the heirarchy according to the data in excel file
df['Building'] = df['Location'].str.split('/').str[0] df['Building'] = df['Location'].str.split('/').str[0]
df['Floor'] = df['Location'].str.split('/').str[1] df['Floor'] = df['Location'].str.split('/').str[1]
df['Freezer'] = df['Location'].str.split('/').str[2] df['Freezer'] = df['Location'].str.split('/').str[2]
df['Freezer full'] = df['Building'] + '/' + df['Floor'] + '/' + df['Freezer'] df['Freezer full'] = df['Building'] + '/' + df['Floor'] + '/' + df['Freezer']
df['Shelf'] = df['Location'].str.split('/').str[3] df['Shelf'] = df['Location'].str.split('/').str[3]
df['Rack'] = df['Location'].str.split('/').str[4] df['Rack'] = df['Location'].str.split('/').str[4]
df['Box'] = df['Location'].str.split('/').str[5] + '/' + df['Location'].str.split('/').str[6] df['Box'] = df['Location'].str.split('/').str[5] + '/' + df['Location'].str.split('/').str[6]
df['Coordinates'] = df['Box'].str.split(':').str[-1] df['Coordinates'] = df['Box'].str.split(':').str[-1]
df['Box'] = df['Box'].str.split(':').str[0] df['Box'] = df['Box'].str.split(':').str[0]
# Convert the 'Coordinates' column to numeric, invalid parsing will be set as NaN # Convert the 'Coordinates' column to numeric, invalid parsing will be set as NaN
df['StorageCol'] = pd.to_numeric(df['Coordinates'].str.split('/').str[-1]) df['StorageCol'] = pd.to_numeric(df['Coordinates'].str.split('/').str[-1])
df['StorageCol'] = None df['StorageCol'] = None
df['StorageRow'] = None df['StorageRow'] = None
df.loc[~df['Coordinates'].isna(), 'StorageCol'] = df.loc[~df['Coordinates'].isna(), 'Coordinates'].str.split('/').str[-1].astype(int) df.loc[~df['Coordinates'].isna(), 'StorageCol'] = df.loc[~df['Coordinates'].isna(), 'Coordinates'].str.split('/').str[-1].astype(int)
df.loc[~df['Coordinates'].isna(), 'StorageRow'] = df.loc[~df['Coordinates'].isna(), 'Coordinates'].str.split('/').str[0] df.loc[~df['Coordinates'].isna(), 'StorageRow'] = df.loc[~df['Coordinates'].isna(), 'Coordinates'].str.split('/').str[0]
# ### Create storage heierarchy in LabKey # ### Create storage heierarchy in LabKey
# Create Unit Type Box # Create Unit Type Box
try:
api.storage.delete_storage_item('Storage Unit Type', {'name': 'Box 10x10'})
except:
pass
result = api.storage.create_storage_item('Storage Unit Type', {'name': 'Box 10x10', 'description': 'Box 10x10', 'UnitType': 'Box', 'cols': 10, 'rows': 10}) def create_storage_unit_type(api, unit_type_name, description, cols, rows):
box_type_id = result['data']['rowId'] try:
result = api.storage.create_storage_item('Storage Unit Type', {'name': unit_type_name, 'description': description,'UnitType': 'Box', 'cols': cols, 'rows': rows})
return result['data']['rowId']
except Exception as e:
print(f"Error creating storage unit type {unit_type_name}: {e}")
return None
box_type_id_10_10 = create_storage_unit_type(api, 'Box 10x10', '10x10 Box for storing samples', 10, 10)
df['box_id'] = '' df['box_id'] = ''
# Physical Locations # Physical Locations
for building in df['Building'].dropna().unique(): for building in df['Building'].dropna().unique():
# 'Physical Location' -> # 'Physical Location' ->
result_building = api.storage.create_storage_item("Physical Location", {"name": building, "description": "Building"}) result_building = api.storage.create_storage_item("Physical Location", {"name": building, "description": "Building"})
building_id = result_building['data']['rowId'] building_id = result_building['data']['rowId']
building_df = df.loc[df['Building'] == building] building_df = df.loc[df['Building'] == building]
for floor in building_df['Floor'].dropna().unique(): for floor in building_df['Floor'].dropna().unique():
result_floor = api.storage.create_storage_item('Physical Location', {'name': floor, 'description': 'Floor', 'locationId': building_id}) result_floor = api.storage.create_storage_item('Physical Location', {'name': floor, 'description': 'Floor', 'locationId': building_id})
floor_id = result_floor['data']['rowId'] floor_id = result_floor['data']['rowId']
floor_df = building_df.loc[building_df['Floor'] == floor] floor_df = building_df.loc[building_df['Floor'] == floor]
# Labkey Terminology = Freezer. Our Dataset = Freezer / Liquid Nitrogen Storage # Labkey Terminology = Freezer. Our Dataset = Freezer / Liquid Nitrogen Storage
for freezer in floor_df['Freezer'].dropna().unique()[:1]: for freezer in floor_df['Freezer'].dropna().unique()[:1]:
freezer_full = '{}-{}-{}'.format(building, floor, freezer) freezer_full = '{}-{}-{}'.format(building, floor, freezer)
freezer_desc = 'Liquid Nitrogen Room' freezer_desc = 'Liquid Nitrogen Room'
if 'Freezer' in freezer: if 'Freezer' in freezer:
freezer_desc = 'Freezer' freezer_desc = 'Freezer'
result_freezer = api.storage.create_storage_item('Freezer', {'name': freezer_full, 'description': freezer_desc, 'locationId': floor_id}) result_freezer = api.storage.create_storage_item('Freezer', {'name': freezer_full, 'description': freezer_desc, 'locationId': floor_id})
freezer_id = result_freezer['data']['rowId'] freezer_id = result_freezer['data']['rowId']
freezer_df = floor_df.loc[floor_df['Freezer'] == freezer] freezer_df = floor_df.loc[floor_df['Freezer'] == freezer]
# Labkey Terminologoy = Shelf. Our Dataset = Shelf / Tank # Labkey Terminologoy = Shelf. Our Dataset = Shelf / Tank
for shelf in freezer_df['Shelf'].dropna().unique(): for shelf in freezer_df['Shelf'].dropna().unique():
shelf_desc = 'Shelf' shelf_desc = 'Shelf'
if 'tank' in shelf.lower(): if 'tank' in shelf.lower():
shelf_desc = 'Tank' shelf_desc = 'Tank'
result_shelf = api.storage.create_storage_item('Shelf', {'name': shelf, 'description': shelf_desc, 'locationId': freezer_id}) result_shelf = api.storage.create_storage_item('Shelf', {'name': shelf, 'description': shelf_desc, 'locationId': freezer_id})
shelf_id = result_shelf['data']['rowId'] shelf_id = result_shelf['data']['rowId']
shelf_df = freezer_df.loc[freezer_df['Shelf'] == shelf] shelf_df = freezer_df.loc[freezer_df['Shelf'] == shelf]
# Labkey Terminology = Rack. Our Dataset = Rack / Tower # Labkey Terminology = Rack. Our Dataset = Rack / Tower
for rack in shelf_df['Rack'].dropna().unique(): for rack in shelf_df['Rack'].dropna().unique():
rack_desc = 'Rack' rack_desc = 'Rack'
if 'tower' in rack.lower(): if 'tower' in rack.lower():
rack_desc = 'Tower' rack_desc = 'Tower'
result_rack = api.storage.create_storage_item('Rack', {'name': rack, 'description': rack_desc, 'locationId': shelf_id}) result_rack = api.storage.create_storage_item('Rack', {'name': rack, 'description': rack_desc, 'locationId': shelf_id})
rack_id = result_rack['data']['rowId'] rack_id = result_rack['data']['rowId']
rack_df = shelf_df.loc[shelf_df['Rack'] == rack] rack_df = shelf_df.loc[shelf_df['Rack'] == rack]
for box in rack_df['Box'].dropna().unique(): for box in rack_df['Box'].dropna().unique():
result_box = api.storage.create_storage_item('Terminal Storage Location', {'name': box, 'typeId': box_type_id, 'locationId': rack_id}) result_box = api.storage.create_storage_item('Terminal Storage Location', {'name': box, 'typeId': box_type_id_10_10, 'locationId': rack_id})
box_id = result_box['data']['rowId'] box_id = result_box['data']['rowId']
box_df = rack_df.loc[rack_df['Box'] == box] box_df = rack_df.loc[rack_df['Box'] == box]
df.loc[box_df.index, 'box_id'] = box_id df.loc[box_df.index, 'box_id'] = box_id
df.loc[box_df.index, 'StorageLocation'] = '{}/{}/{}/{}'.format(freezer_full, shelf, rack, box) df.loc[box_df.index, 'StorageLocation'] = '{}/{}/{}/{}'.format(freezer_full, shelf, rack, box)
print('Created box: {}'.format(box_id)) print('Created box: {}'.format(box_id))
``` ```
%% Output %% Output
Created box: 6522 Created box: 6522
Created box: 6523 Created box: 6523
%% Cell type:markdown id:530f62a9 tags: %% Cell type:markdown id:530f62a9 tags:
## Create Sample Types and Insert Samples into LabKey ## Create Sample Types and Insert Samples into LabKey
%% Cell type:code id:105001ab tags: %% Cell type:code id:105001ab tags:
``` python ``` python
sample_types = df['SampleType'].unique().tolist() sample_types = df['SampleType'].unique().tolist()
print(sample_types) print(sample_types)
# Loop over each sample type and create the domain # Loop over each sample type and create the domain
for sample_type in sample_types: for sample_type in sample_types:
labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}] labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]
for col in columns: for col in columns:
rangeURI = 'string' rangeURI = 'string'
labkey_fields.append({'name': col, 'rangeURI': rangeURI}) labkey_fields.append({'name': col, 'rangeURI': rangeURI})
sample_params = { sample_params = {
'kind': 'SampleSet', 'kind': 'SampleSet',
'domainDesign': { 'domainDesign': {
'name': sample_type, 'name': sample_type,
'fields': labkey_fields, 'fields': labkey_fields,
"domainKindName": "SampleSet", "domainKindName": "SampleSet",
}, },
"options": { "options": {
"name": sample_type, "name": sample_type,
"nameExpression": "S-${genId}", "nameExpression": "S-${genId}",
"aliquotNameExpression": "${${AliquotedFrom}-:withCounter}", "aliquotNameExpression": "${${AliquotedFrom}-:withCounter}",
"importAliases": { "importAliases": {
"SourcePatient": { "SourcePatient": {
"inputType": "dataInputs/Patient" "inputType": "dataInputs/Patient"
} }
} }
} }
} }
# Create domain using API call # Create domain using API call
sample_domain = api.domain.create(sample_params) sample_domain = api.domain.create(sample_params)
print(f"Domain created for sample type: {sample_type}") print(f"Domain created for sample type: {sample_type}")
# Step 1: Prepare Sample Rows # Step 1: Prepare Sample Rows
sample_rows = [] sample_rows = []
# Add Samples of the Sample Type # Add Samples of the Sample Type
for i, row in df.iterrows(): for i, row in df.iterrows():
sample_rows.append(row[columns + ['StorageLocation', 'StorageRow', 'StorageCol', 'SourcePatient']].fillna('').to_dict()) sample_rows.append(row[columns + ['StorageLocation', 'StorageRow', 'StorageCol', 'SourcePatient']].fillna('').to_dict())
sample_rows[-1]['Name'] = row['SampleIdentifier'] sample_rows[-1]['Name'] = row['SampleIdentifier']
if not row['StorageLocation'] or not row['StorageRow'] or not row['StorageCol']: if not row['StorageLocation'] or not row['StorageRow'] or not row['StorageCol']:
del sample_rows[-1]['StorageLocation'] del sample_rows[-1]['StorageLocation']
del sample_rows[-1]['StorageRow'] del sample_rows[-1]['StorageRow']
del sample_rows[-1]['StorageCol'] del sample_rows[-1]['StorageCol']
# Step 2: Insert Rows # Step 2: Insert Rows
for sample_type in sample_types: for sample_type in sample_types:
# Filter rows that match the current sample type # Filter rows that match the current sample type
filtered_rows = [row for row in sample_rows if row.get("SampleType") == sample_type] filtered_rows = [row for row in sample_rows if row.get("SampleType") == sample_type]
# Remove 'Type' column from each row before inserting # Remove 'Type' column from each row before inserting
rows_to_insert = [{k: v for k, v in row.items() if k != "SampleType"} for row in filtered_rows] rows_to_insert = [{k: v for k, v in row.items() if k != "SampleType"} for row in filtered_rows]
# Insert rows using the API # Insert rows using the API
if rows_to_insert: if rows_to_insert:
api.query.insert_rows("samples", sample_type, rows_to_insert) api.query.insert_rows("samples", sample_type, rows_to_insert)
print(f"Inserted {len(rows_to_insert)} rows into {sample_type} domain.") print(f"Inserted {len(rows_to_insert)} rows into {sample_type} domain.")
``` ```
%% Output %% Output
['Blood', 'Saliva'] ['Blood', 'Saliva']
Domain created for sample type: Blood Domain created for sample type: Blood
Domain created for sample type: Saliva Domain created for sample type: Saliva
Inserted 40 rows into Blood domain. Inserted 40 rows into Blood domain.
Inserted 15 rows into Saliva domain. Inserted 15 rows into Saliva domain.
%% Cell type:markdown id:b525f127 tags: %% Cell type:markdown id:b525f127 tags:
## Optional Cleanup (Commented) ## Optional Cleanup (Commented)
%% Cell type:code id:a92c91e9 tags: %% Cell type:code id:a92c91e9 tags:
``` python ``` python
# Delete all sample types and their data # Delete all sample types and their data
''' '''
for sample_type in sample_types: for sample_type in sample_types:
drop_response = api.domain.drop("samples", sample_type) drop_response = api.domain.drop("samples", sample_type)
if "success" in drop_response: if "success" in drop_response:
print("The dataset domain was deleted.")' print("The dataset domain was deleted.")'
''' '''
``` ```
%% Output %% Output
'\nfor sample_type in sample_types:\n\n drop_response = api.domain.drop("samples", sample_type)\n if "success" in drop_response:\n print("The dataset domain was deleted.")\'\n' '\nfor sample_type in sample_types:\n\n drop_response = api.domain.drop("samples", sample_type)\n if "success" in drop_response:\n print("The dataset domain was deleted.")\'\n'
... ...
......
# %% [markdown]
# ## Import Libraries
# %%
import json
import netrc
import os
import sys
import urllib

import labkey
import pandas as pd
import urllib3
from labkey.api_wrapper import APIWrapper
# %% [markdown]
# ## Project Configuration and LabKey API Initialization
# %%
# LabKey connection settings: target project path, server host, and the
# server context path ('' here; use 'labkey' for the main server install).
PROJECT = 'sciCORE-dev/Ankit/Sample_Manager_Test_Dataset'
LABKEY_SERVER = "labkey-pro-dev.scicore.unibas.ch"
CONTEXT_PATH = ''  # Use 'labkey' for main server

# Single API wrapper instance used by every request below (HTTPS enforced).
api = APIWrapper(LABKEY_SERVER, PROJECT, CONTEXT_PATH, use_ssl=True)
# %% [markdown]
# ## Authentication Setup
# %%
# Path to the .netrc file holding the LabKey credentials
NETRC_FILE = os.path.join(os.path.expanduser('~'), '.netrc')
# Fail fast with a clear message when the credentials file is missing
if not os.path.isfile(NETRC_FILE):
    raise FileNotFoundError(f'.netrc file not found: {NETRC_FILE}')
# Parse credentials with the stdlib netrc parser instead of ad-hoc
# whitespace splitting: it understands the standard .netrc grammar
# (multi-entry files, tokens on one line) and selects the entry for
# the machine we actually talk to.
auth = netrc.netrc(NETRC_FILE).authenticators(LABKEY_SERVER)
if auth is None:
    raise ValueError(f'No .netrc entry for machine {LABKEY_SERVER} in {NETRC_FILE}')
login, _, password = auth  # authenticators() returns (login, account, password)
# Basic-auth headers reused by every raw request below
headers = urllib3.util.make_headers(basic_auth=f'{login}:{password}')
# %% [markdown]
# ## Verify Project Directory
# %%
# Verify the LabKey project container exists before doing any further setup.
escaped_path = PROJECT.replace(' ', '%20')  # build_url does not escape spaces itself
url = api.server_context.build_url("project", "getContainers.view", container_path=escaped_path)
params = {"includeSubfolders": True, "depth": 1}
body = urllib.parse.urlencode(params).encode()
resp = api.server_context.make_request(url, body, headers=headers, non_json_response=True)
if resp.status_code == 404:
    raise Exception(f'Project not found: {PROJECT}. Please create it first.')
# %% [markdown]
# ## Create and Populate Source Type 'Study'
# %%
# Define the source Excel file for study data
SOURCE_STUDY = 'Study.xlsx'
# Read data from the Excel file
try:
    df = pd.read_excel(SOURCE_STUDY)
except Exception as e:
    print(f'Error reading Excel file {SOURCE_STUDY}: {e}')
    sys.exit(1)
# First column is SourceID (used as each row's Name); the rest become domain fields.
columns = df.columns[1:].tolist()
# One string field per data column. 'Name' is a built-in DataClass field and is
# supplied per row below, so it is not declared here. (The previous version
# first built a Name-only field list and then immediately overwrote it.)
labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]
# Domain definition for the 'Study' source type (a DataClass in category 'sources')
study_domain_definition = {
    "kind": "DataClass",
    "domainDesign": {
        "name": "Study",
        "fields": labkey_fields
    },
    "options": {
        "category": "sources"
    }
}
# Create the DataClass domain in LabKey
try:
    created_dataclass_domain = api.domain.create(study_domain_definition)
    print("Success: Domain created for sample source: Study")
except Exception as e:
    print(f'Error creating domain: {e}')
    sys.exit(1)
# Build one insert row per spreadsheet line: NaNs become empty strings and the
# SourceID column supplies the required 'Name' value.
sources_rows = []
for _, row in df.iterrows():
    source_row = row[columns].fillna('').to_dict()
    source_row['Name'] = row['SourceID']
    sources_rows.append(source_row)
# Insert data into the DataClass 'Study'
try:
    insert_result = api.query.insert_rows("exp.data", "Study", sources_rows)
    print("Success: Data inserted into the DataClass: Study")
except Exception as e:
    print(f'Error inserting data: {e}')
    sys.exit(1)
# %% [markdown]
# ## Create and Populate Source Type 'Patient' (Linked to Study)
# %%
# Define the source Excel file for patient data
SOURCE_PATIENT = 'Patient.xlsx'
# Read data from the Excel file
try:
    df = pd.read_excel(SOURCE_PATIENT)
except Exception as e:
    print(f'Error reading Excel file {SOURCE_PATIENT}: {e}')
    sys.exit(1)
# Exclude the first column (SourceID, becomes 'Name') and the last (SourceStudy,
# used only for lineage mapping below).
columns = df.columns.tolist()[1:-1]
# One string field per data column. 'Name' is a built-in DataClass field and is
# supplied per row below, so it is not declared here. (The previous version
# first built a Name-only field list and then immediately overwrote it.)
labkey_fields = [{"name": col, "rangeURI": "string"} for col in columns]
# Domain definition for the 'Patient' source type, linked to 'Study' via the
# SourceStudy import alias (establishes parent lineage).
patient_domain_definition = {
    "kind": "DataClass",
    "domainDesign": {
        "name": "Patient",
        "fields": labkey_fields
    },
    "options": {
        "category": "sources",
        "name": "Patient",
        "importAliases": {
            "SourceStudy": {
                "inputType": "dataInputs/Study"
            }
        }
    }
}
# Create the DataClass domain in LabKey
try:
    created_dataclass_domain = api.domain.create(patient_domain_definition)
    print("Success: Domain created for sample source: Patient")
except Exception as e:
    print(f'Error creating domain: {e}')
    sys.exit(1)
# Build one insert row per patient; include 'SourceStudy' so LabKey records
# the Study -> Patient lineage, and use SourceID as the row Name.
sources_rows = []
for _, row in df.iterrows():
    source_row = row[columns + ['SourceStudy']].fillna('').to_dict()
    source_row['Name'] = row['SourceID']
    sources_rows.append(source_row)
# Insert data into the DataClass 'Patient'
try:
    insert_result = api.query.insert_rows("exp.data", "Patient", sources_rows)
    print("Success: Data inserted into the DataClass: Patient")
except Exception as e:
    print(f'Error inserting data: {e}')
    sys.exit(1)
# %% [markdown]
# ## Process Samples and Create Storage Hierarchy
# ### By default: Box sizes of 10x10 are created! Please make sure to check the Box size with the user and correct them in the UI after boxes are created before populating with Samples!!!
# %%
SOURCE_SAMPLES = 'Samples.xlsx'
# Read data from the Excel file
try:
df = pd.read_excel(SOURCE_SAMPLES)
except Exception as e:
print(f'Error reading Excel file {SOURCE_SAMPLES}: {e}')
exit(1)
# Sample ID is a reserved field in LabKey, so we need to rename it to SampleIdentifier.
df.rename(columns={'Sample ID': 'SampleIdentifier'}, inplace=True)
# Get columns for samples table but do not include the 'SourcePatient'. It is only used for mapping to the Patient DataClass
# Also exclude Sample ID column. It will be renamed to "Name" column.
columns = df.columns[1:-1].tolist()
columns
# ### Derive Building/Floor/Freezer/Shelf/Rack/Box/Coordinates from 'Location'
# Expected shape: Building/Floor/Freezer/Shelf/Rack/BoxName:Row/Col
# Note: adjust this hierarchy if the layout in the Excel file changes.
location_parts = df['Location'].str.split('/')
df['Building'] = location_parts.str[0]
df['Floor'] = location_parts.str[1]
df['Freezer'] = location_parts.str[2]
df['Freezer full'] = df['Building'] + '/' + df['Floor'] + '/' + df['Freezer']
df['Shelf'] = location_parts.str[3]
df['Rack'] = location_parts.str[4]
# The box segment optionally carries coordinates after a colon: "Box:Row/Col".
df['Box'] = location_parts.str[5] + '/' + location_parts.str[6]
df['Coordinates'] = df['Box'].str.split(':').str[-1]
df['Box'] = df['Box'].str.split(':').str[0]
# Storage position: default both fields to None (no placement), then fill
# row/column only for entries that do have coordinates.
# BUG FIX: the original first assigned pd.to_numeric(...) over the whole
# column — that value was overwritten on the very next line (dead code) and,
# lacking errors='coerce', would raise on any non-numeric entry. Removed.
df['StorageCol'] = None
df['StorageRow'] = None
has_coords = df['Coordinates'].notna()
df.loc[has_coords, 'StorageCol'] = df.loc[has_coords, 'Coordinates'].str.split('/').str[-1].astype(int)
df.loc[has_coords, 'StorageRow'] = df.loc[has_coords, 'Coordinates'].str.split('/').str[0]
# ### Create storage hierarchy in LabKey
def create_box_type(api, cols=10, rows=10):
    """Create the 'Box {cols}x{rows}' storage unit type and return the API result.

    If a unit type with that name already exists it is deleted first so the
    script can be re-run. The delete is best-effort: it fails harmlessly when
    the type does not exist yet.
    """
    name = 'Box {}x{}'.format(cols, rows)
    try:
        api.storage.delete_storage_item('Storage Unit Type', {'name': name})
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; a failed delete (type absent) is expected.
        pass
    return api.storage.create_storage_item(
        'Storage Unit Type',
        {'name': name, 'description': name, 'UnitType': 'Box', 'cols': cols, 'rows': rows},
    )

# Default 10x10 boxes — correct real sizes in the UI afterwards if needed.
result = create_box_type(api)
box_type_id = result['data']['rowId']
# Will hold the LabKey rowId of the box each sample is stored in.
df['box_id'] = ''
# Mirror the physical storage hierarchy in LabKey:
# Building -> Floor -> Freezer -> Shelf -> Rack -> Box.
# Each create_storage_item call returns the new item's rowId, which is then
# passed as 'locationId' to nest the next level underneath it.
# Physical Locations
for building in df['Building'].dropna().unique():
    # Top level: buildings are plain 'Physical Location' items (no parent).
    result_building = api.storage.create_storage_item("Physical Location", {"name": building, "description": "Building"})
    building_id = result_building['data']['rowId']
    building_df = df.loc[df['Building'] == building]
    for floor in building_df['Floor'].dropna().unique():
        # Floors are also 'Physical Location' items, nested under the building.
        result_floor = api.storage.create_storage_item('Physical Location', {'name': floor, 'description': 'Floor', 'locationId': building_id})
        floor_id = result_floor['data']['rowId']
        floor_df = building_df.loc[building_df['Floor'] == floor]
        # Labkey Terminology = Freezer. Our Dataset = Freezer / Liquid Nitrogen Storage
        # NOTE(review): the [:1] slice processes only the FIRST freezer on
        # each floor — this looks like a debugging leftover; confirm and drop
        # it if every freezer should be created. Rows under skipped freezers
        # never receive a StorageLocation.
        for freezer in floor_df['Freezer'].dropna().unique()[:1]:
            # Freezer names are made site-unique by prefixing building and
            # floor (dash-separated; the unused 'Freezer full' df column uses
            # slashes instead — presumably a leftover, verify).
            freezer_full = '{}-{}-{}'.format(building, floor, freezer)
            # Description defaults to LN2 room unless the name says 'Freezer'.
            freezer_desc = 'Liquid Nitrogen Room'
            if 'Freezer' in freezer:
                freezer_desc = 'Freezer'
            result_freezer = api.storage.create_storage_item('Freezer', {'name': freezer_full, 'description': freezer_desc, 'locationId': floor_id})
            freezer_id = result_freezer['data']['rowId']
            freezer_df = floor_df.loc[floor_df['Freezer'] == freezer]
            # Labkey Terminology = Shelf. Our Dataset = Shelf / Tank
            for shelf in freezer_df['Shelf'].dropna().unique():
                shelf_desc = 'Shelf'
                if 'tank' in shelf.lower():
                    shelf_desc = 'Tank'
                result_shelf = api.storage.create_storage_item('Shelf', {'name': shelf, 'description': shelf_desc, 'locationId': freezer_id})
                shelf_id = result_shelf['data']['rowId']
                shelf_df = freezer_df.loc[freezer_df['Shelf'] == shelf]
                # Labkey Terminology = Rack. Our Dataset = Rack / Tower
                for rack in shelf_df['Rack'].dropna().unique():
                    rack_desc = 'Rack'
                    if 'tower' in rack.lower():
                        rack_desc = 'Tower'
                    result_rack = api.storage.create_storage_item('Rack', {'name': rack, 'description': rack_desc, 'locationId': shelf_id})
                    rack_id = result_rack['data']['rowId']
                    rack_df = shelf_df.loc[shelf_df['Rack'] == rack]
                    # Boxes are terminal storage locations typed by the
                    # 10x10 unit type created above (box_type_id).
                    for box in rack_df['Box'].dropna().unique():
                        result_box = api.storage.create_storage_item('Terminal Storage Location', {'name': box, 'typeId': box_type_id, 'locationId': rack_id})
                        box_id = result_box['data']['rowId']
                        box_df = rack_df.loc[rack_df['Box'] == box]
                        # Record, on every sample row stored in this box, the
                        # box's rowId and the slash-joined storage path used
                        # later when inserting samples.
                        df.loc[box_df.index, 'box_id'] = box_id
                        df.loc[box_df.index, 'StorageLocation'] = '{}/{}/{}/{}'.format(freezer_full, shelf, rack, box)
                        print('Created box: {}'.format(box_id))
# %% [markdown]
# ## Create Sample Types and Insert Samples into LabKey
# %%
# One LabKey Sample Type is created per distinct value of the SampleType column.
sample_types = df['SampleType'].unique().tolist()
print(sample_types)
for sample_type in sample_types:
    # Field list: the mandatory 'Name' field followed by one string field per
    # spreadsheet data column.
    labkey_fields = [{'name': 'Name', 'rangeURI': 'string'}]
    labkey_fields.extend({'name': col, 'rangeURI': 'string'} for col in columns)
    # Domain definition: samples are named S-<generated id>, aliquots get the
    # parent sample's name plus a counter, and a 'SourcePatient' input column
    # links each sample to its Patient source for lineage.
    sample_params = {
        'kind': 'SampleSet',
        'domainDesign': {
            'name': sample_type,
            'fields': labkey_fields,
            'domainKindName': 'SampleSet',
        },
        'options': {
            'name': sample_type,
            'nameExpression': 'S-${genId}',
            'aliquotNameExpression': '${${AliquotedFrom}-:withCounter}',
            'importAliases': {
                'SourcePatient': {'inputType': 'dataInputs/Patient'},
            },
        },
    }
    sample_domain = api.domain.create(sample_params)
    print(f"Domain created for sample type: {sample_type}")
# Step 1: prepare one insert payload per sample row.
sample_rows = []
for _, row in df.iterrows():
    payload = row[columns + ['StorageLocation', 'StorageRow', 'StorageCol', 'SourcePatient']].fillna('').to_dict()
    payload['Name'] = row['SampleIdentifier']
    # Only keep the storage fields when the row has a complete placement.
    # BUG FIX: check the payload values (missing values normalised to '' by
    # fillna above) rather than the raw row — NaN is truthy in Python, so
    # `not row['StorageLocation']` was False for unplaced rows and empty
    # storage fields leaked into the insert payload.
    if not payload['StorageLocation'] or not payload['StorageRow'] or not payload['StorageCol']:
        del payload['StorageLocation']
        del payload['StorageRow']
        del payload['StorageCol']
    sample_rows.append(payload)
# Step 2: insert the prepared rows, one batch per sample type.
for sample_type in sample_types:
    # Collect this type's rows, dropping the 'SampleType' discriminator field
    # (it only selects the target sample type and is not a domain column).
    rows_to_insert = []
    for row in sample_rows:
        if row.get("SampleType") == sample_type:
            rows_to_insert.append({k: v for k, v in row.items() if k != "SampleType"})
    # Skip empty batches; report how many rows each sample type received.
    if rows_to_insert:
        api.query.insert_rows("samples", sample_type, rows_to_insert)
        print(f"Inserted {len(rows_to_insert)} rows into {sample_type} domain.")
# %% [markdown]
# ## Optional Cleanup (Commented)
# %%
# Delete all sample types and their data — disabled by default. Uncomment to
# wipe every sample type created above. (Was a no-op triple-quoted string
# with a stray quote inside; kept as plain comments instead.)
# for sample_type in sample_types:
#     drop_response = api.domain.drop("samples", sample_type)
#     if "success" in drop_response:
#         print("The dataset domain was deleted.")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment