From e97528b1ec1e28feb6cba4e4e56da39c4b7043c0 Mon Sep 17 00:00:00 2001
From: "christoph.stritt@unibas.ch" <christoph.stritt@unibas.ch>
Date: Thu, 11 Jul 2024 11:55:04 +0200
Subject: [PATCH] Pangenome graph notebook added

---
 notebooks/create_pangenome_graph.ipynb | 168 ++++++++++++++
 notebooks/pangenome_graph.ipynb        |   0
 notebooks/tree_from_graph.ipynb        | 310 +++++++++++++++++++++++++
 3 files changed, 478 insertions(+)
 create mode 100644 notebooks/create_pangenome_graph.ipynb
 delete mode 100644 notebooks/pangenome_graph.ipynb
 create mode 100644 notebooks/tree_from_graph.ipynb

diff --git a/notebooks/create_pangenome_graph.ipynb b/notebooks/create_pangenome_graph.ipynb
new file mode 100644
index 0000000..8ec974a
--- /dev/null
+++ b/notebooks/create_pangenome_graph.ipynb
@@ -0,0 +1,168 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create a pangenome graph with pggb\n",
+    "\n",
+    "pggb requires a single gzipped and indexed fasta file containing all assemblies. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Install biopython\n",
+    "\n",
+    "If not already present, create a conda environment with biopython. \n",
+    "\n",
+    "```\n",
+    "conda create -n bio biopython\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then select the bio environent as the kernel for the Jupyter notebook."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Combine assemblies\n",
+    "\n",
+    "The script below creates a file 'assemblies_combined.fasta' in which all assemblies listed in 'paths_to_assemblies.txt' are combined.\n",
+    "\n",
+    "'paths_to_assemblies.txt' should be a text file with one path per line. \n",
+    "\n",
+    "If one_contig_only is set to True, only single-contig assemblies are included."
+   ]
+  },
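+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If the list does not exist yet, a minimal sketch like the one below can generate it. It assumes the assemblies sit in a hypothetical directory 'assemblies/' with the extension '.fasta'; adjust the pattern to your layout."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "\n",
+    "# Hypothetical directory and extension; adjust to where the assemblies actually live\n",
+    "assembly_paths = sorted(glob.glob('assemblies/*.fasta'))\n",
+    "\n",
+    "with open('paths_to_assemblies.txt', 'w') as out:\n",
+    "    for path in assembly_paths:\n",
+    "        out.write(path + '\\n')\n",
+    "\n",
+    "print(str(len(assembly_paths)) + ' paths written')"
+   ]
+  },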
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from Bio import SeqIO\n",
+    "\n",
+    "\n",
+    "assemblies = 'paths_to_assemblies.txt'\n",
+    "one_contig_only = False\n",
+    "\n",
+    "fasta_combined = []\n",
+    "\n",
+    "with open(assemblies) as f:\n",
+    "    \n",
+    "    for line in f:\n",
+    "        \n",
+    "        assembly_path = line.strip()\n",
+    "        strain = assembly_path.split('/')[-1].split('.')[0]\n",
+    "     \n",
+    "        records = [rec for rec in SeqIO.parse(assembly_path, \"fasta\")]\n",
+    "        \n",
+    "        if one_contig_only and len(records) > 1:\n",
+    "            print('Discarded multi-contig assembly for ' + strain)\n",
+    "            continue\n",
+    "        \n",
+    "        fasta_combined += records\n",
+    "\n",
+    "SeqIO.write(fasta_combined, 'assemblies_combined.fasta', 'fasta')\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Zip and index fasta"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "gzip single_contig_assemblies.fasta\n",
+    "samtools faidx single_contig_assemblies.fasta.gz"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run pggb\n",
+    "\n",
+    "Important paramters:\n",
+    "\n",
+    "    -i : path to combined assemblies\n",
+    "    -o : output directory name\n",
+    "    -n : Number of contigs included\n",
+    "    -p : Similarity of contigs\n",
+    "    -s : Maximum length of repeats in the genome"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Example slurm script:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#!/bin/bash\n",
+    "\n",
+    "#SBATCH --cpus-per-task=20\n",
+    "#SBATCH --mem-per-cpu=4G\n",
+    "#SBATCH --time=24:00:00\n",
+    "#SBATCH --qos=1day\n",
+    "#SBATCH --output=stdout.%j\n",
+    "#SBATCH --error=stderr.%j\n",
+    "\n",
+    "singularity exec /scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/pggb_latest.sif pggb \\\n",
+    "  -i assemblies_combined.fasta.gz \\\n",
+    "  -o ./pggb \\\n",
+    "  -m \\\n",
+    "  -t 20 \\\n",
+    "  -n 52 \\\n",
+    "  -p 99 \\\n",
+    "  -s 5k\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/pangenome_graph.ipynb b/notebooks/pangenome_graph.ipynb
deleted file mode 100644
index e69de29..0000000
diff --git a/notebooks/tree_from_graph.ipynb b/notebooks/tree_from_graph.ipynb
new file mode 100644
index 0000000..b305294
--- /dev/null
+++ b/notebooks/tree_from_graph.ipynb
@@ -0,0 +1,310 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create a phylogenetic tree from a pangenome graph\n",
+    "\n",
+    "Call SNPs from graph with vg deconstruct.\n",
+    "\n",
+    "Filter out resistance genes and clustered variants indicative of gene conversion or selection acting on sloppy targets.\n",
+    "\n",
+    "Maybe determine clustering threshold through simulations?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "singularity exec /home/cristobal/programs/pggb_latest.sif vg deconstruct \\\n",
+    "  pggb/assemblies_combined.fasta.gz.d71a954.11fba48.a4da63a.smooth.final.gfa -d1 -e \\\n",
+    "    -p H37Rv \\\n",
+    "    -t 4 \\\n",
+    "    --all-snarls \\\n",
+    "    > variants.vcf\n",
+    "\n",
+    "grep -v -c ^# variants.vcf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create list of positions to exclude\n",
+    "import pandas\n",
+    "\n",
+    "dr_loci = pandas.read_csv(\"DR_genes_coordinates.tsv\", sep=\"\\t\")\n",
+    "repeats = pandas.read_csv(\"nucmer/delta.out.filtered.tsv\")\n",
+    "\n",
+    "excluded = set()\n",
+    "\n",
+    "for i, gene in dr_loci.iterrows():\n",
+    "    excluded.update(range(gene[\"start_region\"], gene[\"stop_region\"]+1))\n",
+    "\n",
+    "for i,hpair in repeats.iterrows():\n",
+    "    excluded.update(range(hpair[\"S1\"], hpair[\"E1\"]+1))\n",
+    "    excluded.update(range(hpair[\"S2\"], hpair[\"E2\"]+1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "canettii = [\"CIPT_140010059_/_STB-A\", \"ET1291\"]\n",
+    "\n",
+    "keep = []\n",
+    "\n",
+    "discarded = {\n",
+    "    \"In repeat or DR locus\" : 0,\n",
+    "    \"Not SNP\" : 0,\n",
+    "    \"canettii only\" : 0,\n",
+    "    \"Missing genotypes\" : 0,\n",
+    "    \"Invariant\" : 0\n",
+    "}\n",
+    "\n",
+    "with open(\"variants.vcf\") as f:\n",
+    "    \n",
+    "    for line in f:\n",
+    "        \n",
+    "        if line.startswith(\"#CHROM\"):\n",
+    "            \n",
+    "            fields = line.strip().split(\"\\t\")\n",
+    "            strains = fields[9:]\n",
+    "            canettii_i = [strains.index(x) for x in canettii]\n",
+    "            \n",
+    "            # Dictionary to store SNPs for each strain\n",
+    "            strain_seqs = {strain : \"\" for strain in strains}\n",
+    "            strain_seqs[\"H37Rv\"] = \"\"\n",
+    "            continue\n",
+    "        \n",
+    "        elif line.startswith(\"#\"):\n",
+    "            continue\n",
+    "\n",
+    "        fields = line.strip().split(\"\\t\")\n",
+    "        position = int(fields[1])\n",
+    "        \n",
+    "        ref = fields[3]\n",
+    "        alt = fields[4].split(\",\")\n",
+    "        \n",
+    "        gt = fields[9:]\n",
+    "        \n",
+    "        # Two entries weirdly have missing genotypes. Skip them\n",
+    "        if len(gt) != len(strains):\n",
+    "            print(fields)\n",
+    "            continue\n",
+    "        \n",
+    "        \n",
+    "        canettii_gt = [gt[i] for i in canettii_i]\n",
+    "        others = [gt[i] for i in range(len(gt)) if i not in canettii_i]\n",
+    "        \n",
+    "        \n",
+    "        # Exclude sites in repeats and DR loci\n",
+    "        if position in excluded:\n",
+    "            discarded[\"In repeat or DR locus\"] += 1\n",
+    "            continue\n",
+    "\n",
+    "        # Only SNPs\n",
+    "        if len(ref) > 1 or any(len(x) > 1 for x in alt):\n",
+    "            discarded[\"Not SNP\"] += 1\n",
+    "            continue\n",
+    "            \n",
+    "        # Skip canettii-only variants\n",
+    "        if \"1\" in canettii_gt and \"1\" not in others:\n",
+    "            discarded[\"canettii only\"] += 1\n",
+    "            continue\n",
+    "\n",
+    "        # Exclude missing genotypes\n",
+    "        if any(x in gt for x in [\"\", \".\"]):\n",
+    "            discarded[\"Missing genotypes\"] += 1\n",
+    "            #continue\n",
+    "\n",
+    "\n",
+    "        # Write alleles to site\n",
+    "        alleles = [ref] + alt       \n",
+    "        site = ''\n",
+    "        \n",
+    "        for strain in strains:\n",
+    "            strain_i = strains.index(strain)\n",
+    "            strain_gt = gt[strain_i]\n",
+    "            if not strain_gt or strain_gt not in '012345':\n",
+    "                strain_allele = '-'\n",
+    "            else:\n",
+    "                strain_allele = alleles[int(strain_gt)]\n",
+    "            \n",
+    "            site += strain_allele\n",
+    "            \n",
+    "        # Re-check if there are remaining invariant sites\n",
+    "        bases_only = ''\n",
+    "        for allele in site:\n",
+    "            if allele in ['A','C','G', 'T']:\n",
+    "                bases_only += allele\n",
+    "\n",
+    "        #bases_only = ref + site.replace('-', '')\n",
+    "        \n",
+    "        \n",
+    "        if len(set(bases_only)) > 1:\n",
+    "                \n",
+    "            for i, allele in enumerate(site):\n",
+    "                strain = strains[i]\n",
+    "                strain_seqs[strain] += allele\n",
+    "                \n",
+    "            # Write H37Rv allele\n",
+    "            strain_seqs[\"H37Rv\"] += ref\n",
+    "            \n",
+    "            keep.append((position, alleles, gt))\n",
+    "            \n",
+    "        else:\n",
+    "            discarded[\"Invariant\"] += 1\n",
+    "\n",
+    "                \n",
+    "            \n",
+    "print(\"Filtered out:\")\n",
+    "for k in discarded:\n",
+    "    print(k, str(discarded[k]))\n",
+    "    \n",
+    "print(\"Kept: \", str(len(keep)))"
+   ]
+  },
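+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The introduction mentions filtering out clustered variants, which the cell above does not implement yet. Below is a minimal sketch that flags SNPs lying closer together than a hypothetical minimum spacing of 10 bp; the actual threshold would still need to be determined, e.g. through the simulations mentioned above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal sketch of a clustered-SNP flag. The 10 bp spacing is a placeholder,\n",
+    "# not a validated threshold (see the note on simulations in the introduction).\n",
+    "min_spacing = 10\n",
+    "\n",
+    "positions = sorted(pos for pos, alleles, gt in keep)\n",
+    "\n",
+    "clustered = set()\n",
+    "for left, right in zip(positions, positions[1:]):\n",
+    "    if right - left < min_spacing:\n",
+    "        clustered.update([left, right])\n",
+    "\n",
+    "print('SNPs flagged as clustered:', len(clustered))"
+   ]
+  },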
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Write variable alignment to fasta"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from Bio import SeqIO\n",
+    "from Bio.Seq import Seq\n",
+    "from Bio.SeqRecord import SeqRecord\n",
+    "\n",
+    "records = []\n",
+    "\n",
+    "for strain in [\"H37Rv\"] + strains:\n",
+    "    \n",
+    "    sequence = Seq(strain_seqs[strain])\n",
+    "    record = SeqRecord(sequence, id=strain, name=\"\", description=\"\")\n",
+    "    records.append(record)\n",
+    "    \n",
+    "SeqIO.write(records, \"H37Rv_SNP_alignment.from_graph.fasta\", \"fasta\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Count non-variable positions\n",
+    "\n",
+    "Get the core nodes from the graph and count the number of A,C,G,T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import Counter\n",
+    "\n",
+    "GRAPH='/home/cristobal/TB/projects/deletionism/NOTEBOOKS/A_ancestral_reference/pggb/assemblies_combined.fasta.gz.d71a954.11fba48.a4da63a.smooth.final.gfa'\n",
+    "\n",
+    "n_paths = 0\n",
+    "\n",
+    "node_seqs = {}\n",
+    "node_count = {}\n",
+    "\n",
+    "# First traversal: get nodes present in all paths\n",
+    "with open(GRAPH) as f:\n",
+    "    for line in f:\n",
+    "        \n",
+    "        fields = line.strip().split('\\t')\n",
+    "        \n",
+    "        if line.startswith('S'):\n",
+    "            node_seqs[fields[1]] = fields[2]\n",
+    "        \n",
+    "        elif line.startswith('P'):\n",
+    "            \n",
+    "            n_paths += 1\n",
+    "            \n",
+    "            paff = [x[:-1] for x in fields[2].split(',') ]\n",
+    "            \n",
+    "            for node in paff: \n",
+    "                \n",
+    "                if node not in node_count:\n",
+    "                    node_count[node] = 0\n",
+    "                \n",
+    "                node_count[node] += 1\n",
+    "                \n",
+    "invariable_seq = ''\n",
+    "\n",
+    "for node in node_count:\n",
+    "    if node_count[node] == n_paths:\n",
+    "        invariable_seq += node_seqs[node]\n",
+    "        \n",
+    "base_counts = Counter(invariable_seq)\n",
+    "\n",
+    "for base in base_counts:\n",
+    "    print(base, base_counts[base])\n",
+    "\n",
+    "\n",
+    "total = sum([base_counts[base] for base in base_counts])\n",
+    "gc = base_counts['G'] + base_counts['C']\n",
+    "\n",
+    "print('Total invariable bases: ', total)\n",
+    "print('GC content: ', gc/total)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Estimate tree!\n",
+    "\n",
+    "Use Stamatakis model in RAXML to account for invariable sites and correct branch lengths.\n",
+    "\n",
+    "--model GTR+G+ASC_STAM{633511/1206483/1214591/640523}"
+   ]
+  },
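+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The four counts in ASC_STAM can be assembled directly from the invariable-site tally computed above. A small sketch, assuming RAxML-NG expects the weights in A/C/G/T order:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build the ASC_STAM model string from the invariable-site counts computed above.\n",
+    "# Assumes the four weights are expected in A/C/G/T order.\n",
+    "stam_counts = [base_counts[base] for base in 'ACGT']\n",
+    "model = 'GTR+G+ASC_STAM{' + '/'.join(str(c) for c in stam_counts) + '}'\n",
+    "print(model)"
+   ]
+  },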
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "~/programs/raxml-ng_v1.2.0_linux_x86_64/raxml-ng \\\n",
+    "  --msa H37Rv_SNP_alignment.from_graph.fasta \\\n",
+    "  --all --model GTR+G+ASC_STAM{633312/1206359/1214398/640320} --tree pars{10} --bs-trees 100 --threads 4 --outgroup CIPT_140010059_/_STB-A\n",
+    "\n",
+    "mv H37Rv_SNP_alignment.from_graph.fasta.* raxml\n",
+    "\n",
+    " "
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-- 
GitLab