diff --git a/notebooks/create_pangenome_graph.ipynb b/notebooks/create_pangenome_graph.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..8ec974a6a0b26207400cec6dec0ac44537da2cae --- /dev/null +++ b/notebooks/create_pangenome_graph.ipynb @@ -0,0 +1,168 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create a pangenome graph with pggb\n", + "\n", + "pggb requires a single gzipped and indexed fasta file containing all assemblies. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install biopython\n", + "\n", + "If not already present, create a conda environment with biopython. \n", + "\n", + "```\n", + "conda create -n bio biopython\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then select the bio environent as the kernel for the Jupyter notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Combine assemblies\n", + "\n", + "The script below creates a file 'assemblies_combined.fasta' in which all assemblies listed in 'paths_to_assemblies.txt' are combined.\n", + "\n", + "'paths_to_assemblies.txt' should be a text file with one path per line. \n", + "\n", + "If one_contig_only is set to True, only single-contig assemblies are included." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "\n", + "\n", + "assemblies = 'paths_to_assemblies.txt'\n", + "one_contig_only = False\n", + "\n", + "fasta_combined = []\n", + "\n", + "with open(assemblies) as f:\n", + " \n", + " for line in f:\n", + " \n", + " assembly_path = line.strip()\n", + " strain = assembly_path.split('/')[-1].split('.')[0]\n", + " \n", + " records = [rec for rec in SeqIO.parse(assembly_path, \"fasta\")]\n", + " \n", + " if one_contig_only and len(records) > 1:\n", + " print('Discarded multi-contig assembly for ' + strain)\n", + " continue\n", + " \n", + " fasta_combined += records\n", + "\n", + "SeqIO.write(fasta_combined, 'assemblies_combined.fasta', 'fasta')\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zip and index fasta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "gzip single_contig_assemblies.fasta\n", + "samtools faidx single_contig_assemblies.fasta.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run pggb\n", + "\n", + "Important paramters:\n", + "\n", + " -i : path to combined assemblies\n", + " -o : output directory name\n", + " -n : Number of contigs included\n", + " -p : Similarity of contigs\n", + " -s : Maximum length of repeats in the genome" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Example slurm script:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#!/bin/bash\n", + "\n", + "#SBATCH --cpus-per-task=20\n", + "#SBATCH --mem-per-cpu=4G\n", + "#SBATCH --time=24:00:00\n", + "#SBATCH --qos=1day\n", + "#SBATCH --output=stdout.%j\n", + "#SBATCH --error=stderr.%j\n", + "\n", + "singularity exec /scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/pggb_latest.sif pggb \\\n", + " -i assemblies_combined.fasta.gz \\\n", + " -o ./pggb \\\n", + " -m \\\n", + " -t 20 \\\n", + " -n 52 \\\n", + " -p 99 \\\n", + " -s 5k\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/pangenome_graph.ipynb b/notebooks/pangenome_graph.ipynb deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/notebooks/tree_from_graph.ipynb b/notebooks/tree_from_graph.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b3052943781b741185276bd090f1b7a182ef2874 --- /dev/null +++ b/notebooks/tree_from_graph.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create a phylogenetic tree from a pangenome graph\n", + "\n", + "Call SNPs from graph with vg deconstruct.\n", + "\n", + "Filter out resistance genes and clustered variants indicative of gene conversion or selection acting on sloppy targets.\n", + "\n", + "Maybe determine clustering threshold through simulations?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "singularity exec /home/cristobal/programs/pggb_latest.sif vg deconstruct \\\n", + " pggb/assemblies_combined.fasta.gz.d71a954.11fba48.a4da63a.smooth.final.gfa -d1 -e \\\n", + " -p H37Rv \\\n", + " -t 4 \\\n", + " --all-snarls \\\n", + " > variants.vcf\n", + "\n", + "grep -v -c ^# variants.vcf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create list of positions to exclude\n", + "import pandas\n", + "\n", + "dr_loci = pandas.read_csv(\"DR_genes_coordinates.tsv\", sep=\"\\t\")\n", + "repeats = pandas.read_csv(\"nucmer/delta.out.filtered.tsv\")\n", + "\n", + "excluded = set()\n", + "\n", + "for i, gene in dr_loci.iterrows():\n", + " excluded.update(range(gene[\"start_region\"], gene[\"stop_region\"]+1))\n", + "\n", + "for i,hpair in repeats.iterrows():\n", + " excluded.update(range(hpair[\"S1\"], hpair[\"E1\"]+1))\n", + " excluded.update(range(hpair[\"S2\"], hpair[\"E2\"]+1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "canettii = [\"CIPT_140010059_/_STB-A\", \"ET1291\"]\n", + "\n", + "keep = []\n", + "\n", + "discarded = {\n", + " \"In repeat or DR locus\" : 0,\n", + " \"Not SNP\" : 0,\n", + " \"canettii only\" : 0,\n", + " \"Missing genotypes\" : 0,\n", + " \"Invariant\" : 0\n", + "}\n", + "\n", + "with open(\"variants.vcf\") as f:\n", + " \n", + " for line in f:\n", + " \n", + " if line.startswith(\"#CHROM\"):\n", + " \n", + " fields = line.strip().split(\"\\t\")\n", + " strains = fields[9:]\n", + " canettii_i = [strains.index(x) for x in canettii]\n", + " \n", + " # Dictionary to store SNPs for each strain\n", + " strain_seqs = {strain : \"\" for strain in strains}\n", + " strain_seqs[\"H37Rv\"] = \"\"\n", + " continue\n", + " \n", + " elif line.startswith(\"#\"):\n", + " continue\n", + "\n", + " fields = line.strip().split(\"\\t\")\n", + " position = int(fields[1])\n", + " \n", + " ref = fields[3]\n", + " alt = fields[4].split(\",\")\n", + " \n", + " gt = fields[9:]\n", + " \n", + " # Two entries weirdly have missing genotypes. Skip them\n", + " if len(gt) != len(strains):\n", + " print(fields)\n", + " continue\n", + " \n", + " \n", + " canettii_gt = [gt[i] for i in canettii_i]\n", + " others = [gt[i] for i in range(len(gt)) if i not in canettii_i]\n", + " \n", + " \n", + " # Exclude sites in repeats and DR loci\n", + " if position in excluded:\n", + " discarded[\"In repeat or DR locus\"] += 1\n", + " continue\n", + "\n", + " # Only SNPs\n", + " if len(ref) > 1 or any(len(x) > 1 for x in alt):\n", + " discarded[\"Not SNP\"] += 1\n", + " continue\n", + " \n", + " # Skip canettii-only variants\n", + " if \"1\" in canettii_gt and \"1\" not in others:\n", + " discarded[\"canettii only\"] += 1\n", + " continue\n", + "\n", + " # Exclude missing genotypes\n", + " if any(x in gt for x in [\"\", \".\"]):\n", + " discarded[\"Missing genotypes\"] += 1\n", + " #continue\n", + "\n", + "\n", + " # Write alleles to site\n", + " alleles = [ref] + alt \n", + " site = ''\n", + " \n", + " for strain in strains:\n", + " strain_i = strains.index(strain)\n", + " strain_gt = gt[strain_i]\n", + " if not strain_gt or strain_gt not in '012345':\n", + " strain_allele = '-'\n", + " else:\n", + " strain_allele = alleles[int(strain_gt)]\n", + " \n", + " site += strain_allele\n", + " \n", + " # Re-check if there are remaining invariant sites\n", + " bases_only = ''\n", + " for allele in site:\n", + " if allele in ['A','C','G', 'T']:\n", + " bases_only += allele\n", + "\n", + " #bases_only = ref + site.replace('-', '')\n", + " \n", + " \n", + " if len(set(bases_only)) > 1:\n", + " \n", + " for i, allele in enumerate(site):\n", + " strain = strains[i]\n", + " strain_seqs[strain] += allele\n", + " \n", + " # Write H37Rv allele\n", + " strain_seqs[\"H37Rv\"] += ref\n", + " \n", + " keep.append((position, alleles, gt))\n", + " \n", + " else:\n", + " discarded[\"Invariant\"] += 1\n", + "\n", + " \n", + " \n", + "print(\"Filtered out:\")\n", + "for k in discarded:\n", + " print(k, str(discarded[k]))\n", + " \n", + "print(\"Kept: \", str(len(keep)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Write variable alignment to fasta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "from Bio.Seq import Seq\n", + "from Bio.SeqRecord import SeqRecord\n", + "\n", + "records = []\n", + "\n", + "for strain in [\"H37Rv\"] + strains:\n", + " \n", + " sequence = Seq(strain_seqs[strain])\n", + " record = SeqRecord(sequence, id=strain, name=\"\", description=\"\")\n", + " records.append(record)\n", + " \n", + "SeqIO.write(records, \"H37Rv_SNP_alignment.from_graph.fasta\", \"fasta\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Count non-variable positions\n", + "\n", + "Get the core nodes from the graph and count the number of A,C,G,T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "\n", + "GRAPH='/home/cristobal/TB/projects/deletionism/NOTEBOOKS/A_ancestral_reference/pggb/assemblies_combined.fasta.gz.d71a954.11fba48.a4da63a.smooth.final.gfa'\n", + "\n", + "n_paths = 0\n", + "\n", + "node_seqs = {}\n", + "node_count = {}\n", + "\n", + "# First traversal: get nodes present in all paths\n", + "with open(GRAPH) as f:\n", + " for line in f:\n", + " \n", + " fields = line.strip().split('\\t')\n", + " \n", + " if line.startswith('S'):\n", + " node_seqs[fields[1]] = fields[2]\n", + " \n", + " elif line.startswith('P'):\n", + " \n", + " n_paths += 1\n", + " \n", + " paff = [x[:-1] for x in fields[2].split(',') ]\n", + " \n", + " for node in paff: \n", + " \n", + " if node not in node_count:\n", + " node_count[node] = 0\n", + " \n", + " node_count[node] += 1\n", + " \n", + "invariable_seq = ''\n", + "\n", + "for node in node_count:\n", + " if node_count[node] == n_paths:\n", + " invariable_seq += node_seqs[node]\n", + " \n", + "base_counts = Counter(invariable_seq)\n", + "\n", + "for base in base_counts:\n", + " print(base, base_counts[base])\n", + "\n", + "\n", + "total = sum([base_counts[base] for base in base_counts])\n", + "gc = base_counts['G'] + base_counts['C']\n", + "\n", + "print('Total invariable bases: ', total)\n", + "print('GC content: ', gc/total)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Estimate tree!\n", + "\n", + "Use Stamatakis model in RAXML to account for invariable sites and correct branch lengths.\n", + "\n", + "--model GTR+G+ASC_STAM{633511/1206483/1214591/640523}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "~/programs/raxml-ng_v1.2.0_linux_x86_64/raxml-ng \\\n", + " --msa H37Rv_SNP_alignment.from_graph.fasta \\\n", + " --all --model GTR+G+ASC_STAM{633312/1206359/1214398/640320} --tree pars{10} --bs-trees 100 --threads 4 --outgroup CIPT_140010059_/_STB-A\n", + "\n", + "mv H37Rv_SNP_alignment.from_graph.fasta.* raxml\n", + "\n", + " " + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}