diff --git a/.gitignore b/.gitignore index 5f15816652b9e312b96c65b2446663e812c5f890..a0a8984108bab4c902ecf3a73bfa0fd28693fce6 100755 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,4 @@ container/assemblySC.sif rulegraph.pdf resources/bakta_db facienda.md - +workshop/pggb_latest.sif diff --git a/notebooks/create_pangenome_graph.ipynb b/notebooks/create_pangenome_graph.ipynb deleted file mode 100644 index 8ec974a6a0b26207400cec6dec0ac44537da2cae..0000000000000000000000000000000000000000 --- a/notebooks/create_pangenome_graph.ipynb +++ /dev/null @@ -1,168 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create a pangenome graph with pggb\n", - "\n", - "pggb requires a single gzipped and indexed fasta file containing all assemblies. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Install biopython\n", - "\n", - "If not already present, create a conda environment with biopython. \n", - "\n", - "```\n", - "conda create -n bio biopython\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then select the bio environent as the kernel for the Jupyter notebook." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Combine assemblies\n", - "\n", - "The script below creates a file 'assemblies_combined.fasta' in which all assemblies listed in 'paths_to_assemblies.txt' are combined.\n", - "\n", - "'paths_to_assemblies.txt' should be a text file with one path per line. \n", - "\n", - "If one_contig_only is set to True, only single-contig assemblies are included." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from Bio import SeqIO\n", - "\n", - "\n", - "assemblies = 'paths_to_assemblies.txt'\n", - "one_contig_only = False\n", - "\n", - "fasta_combined = []\n", - "\n", - "with open(assemblies) as f:\n", - " \n", - " for line in f:\n", - " \n", - " assembly_path = line.strip()\n", - " strain = assembly_path.split('/')[-1].split('.')[0]\n", - " \n", - " records = [rec for rec in SeqIO.parse(assembly_path, \"fasta\")]\n", - " \n", - " if one_contig_only and len(records) > 1:\n", - " print('Discarded multi-contig assembly for ' + strain)\n", - " continue\n", - " \n", - " fasta_combined += records\n", - "\n", - "SeqIO.write(fasta_combined, 'assemblies_combined.fasta', 'fasta')\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Zip and index fasta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "%%bash\n", - "\n", - "gzip single_contig_assemblies.fasta\n", - "samtools faidx single_contig_assemblies.fasta.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run pggb\n", - "\n", - "Important paramters:\n", - "\n", - " -i : path to combined assemblies\n", - " -o : output directory name\n", - " -n : Number of contigs included\n", - " -p : Similarity of contigs\n", - " -s : Maximum length of repeats in the genome" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Example slurm script:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#!/bin/bash\n", - "\n", - "#SBATCH --cpus-per-task=20\n", - "#SBATCH --mem-per-cpu=4G\n", - "#SBATCH --time=24:00:00\n", - "#SBATCH --qos=1day\n", - "#SBATCH --output=stdout.%j\n", - "#SBATCH --error=stderr.%j\n", - "\n", - "singularity exec 
/scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/pggb_latest.sif pggb \\\n", - " -i assemblies_combined.fasta.gz \\\n", - " -o ./pggb \\\n", - " -m \\\n", - " -t 20 \\\n", - " -n 52 \\\n", - " -p 99 \\\n", - " -s 5k\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/workshop/A_create_graph.ipynb b/workshop/A_create_graph.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..8d02ef63a5ee23d9a1069e20948989baf307e01c --- /dev/null +++ b/workshop/A_create_graph.ipynb @@ -0,0 +1,1184 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create a pangenome graph with PGGB\n", + "\n", + "PGGB requires as input a single fasta file that contains the genome assemblies. \n", + "\n", + "The file **/data/MTBC.L1.assemblies.fasta** contains all the contigs resulting from the assembly of the 19 L1 strains, while **/data/assembly_summaries.tsv** contains assembly metadata.\n", + "\n", + "\n", + "Starting from these two files, we do the following:\n", + " - load the metadata, select complete genomes\n", + " - write all complete genomes to a fasta\n", + "\n", + "â“ What do we mean with 'complete'?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Flye assembly summary" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sample</th>\n", + " <th>#seq_name</th>\n", + " <th>length</th>\n", + " <th>cov.</th>\n", + " <th>circ.</th>\n", + " <th>repeat</th>\n", + " <th>mult.</th>\n", + " <th>alt_group</th>\n", + " <th>graph_path</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>PB000195</td>\n", + " <td>contig_1</td>\n", + " <td>4444823</td>\n", + " <td>114</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>PB000196</td>\n", + " <td>contig_10</td>\n", + " <td>4421359</td>\n", + " <td>133</td>\n", + " <td>N</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>10,41</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>PB000196</td>\n", + " <td>contig_47</td>\n", + " <td>13589</td>\n", + " <td>32</td>\n", + " <td>N</td>\n", + " <td>Y</td>\n", + " <td>1</td>\n", + " <td>48</td>\n", + " <td>*,47,*</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>PB000196</td>\n", + " <td>contig_1</td>\n", + " <td>10270</td>\n", + " <td>45</td>\n", + " <td>N</td>\n", + " <td>Y</td>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>*,1,*</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>PB000196</td>\n", + " <td>contig_42</td>\n", + " <td>1662</td>\n", 
+ " <td>52</td>\n", + " <td>N</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>-41,42</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sample #seq_name length cov. circ. repeat mult. alt_group graph_path\n", + "0 PB000195 contig_1 4444823 114 Y N 1 * 1\n", + "1 PB000196 contig_10 4421359 133 N N 1 * 10,41\n", + "2 PB000196 contig_47 13589 32 N Y 1 48 *,47,*\n", + "3 PB000196 contig_1 10270 45 N Y 1 6 *,1,*\n", + "4 PB000196 contig_42 1662 52 N N 1 * -41,42" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas\n", + "\n", + "md = pandas.read_csv('data/assembly_summaries.tsv', sep='\\t')\n", + "md.head()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How many sequences per strain?" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sample\n", + "PB000205 49\n", + "PB000202 7\n", + "PB000196 4\n", + "PB000220 2\n", + "PB000198 2\n", + "PB000207 1\n", + "PB000219 1\n", + "PB000211 1\n", + "PB000210 1\n", + "PB000209 1\n", + "PB000208 1\n", + "PB000195 1\n", + "PB000206 1\n", + "PB000203 1\n", + "PB000201 1\n", + "PB000200 1\n", + "PB000199 1\n", + "PB000197 1\n", + "PB000204 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md['sample'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Which sequences are cirularized?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sample</th>\n", + " <th>#seq_name</th>\n", + " <th>length</th>\n", + " <th>cov.</th>\n", + " <th>circ.</th>\n", + " <th>repeat</th>\n", + " <th>mult.</th>\n", + " <th>alt_group</th>\n", + " <th>graph_path</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>PB000195</td>\n", + " <td>contig_1</td>\n", + " <td>4444823</td>\n", + " <td>114</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>PB000197</td>\n", + " <td>contig_1</td>\n", + " <td>4410932</td>\n", + " <td>129</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>PB000199</td>\n", + " <td>contig_1</td>\n", + " <td>4426248</td>\n", + " <td>112</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>PB000200</td>\n", + " <td>contig_1</td>\n", + " <td>4428581</td>\n", + " <td>84</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>PB000201</td>\n", + " <td>contig_1</td>\n", + " <td>4408479</td>\n", + " <td>96</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>PB000203</td>\n", + " <td>contig_1</td>\n", + " <td>4434008</td>\n", + " <td>47</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>PB000204</td>\n", + " <td>contig_1</td>\n", + " <td>4419776</td>\n", + " <td>74</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>PB000205</td>\n", + " <td>contig_50</td>\n", + " <td>4420883</td>\n", + " <td>37</td>\n", + " <td>Y</td>\n", + " <td>Y</td>\n", + " <td>2</td>\n", + " <td>*</td>\n", + " <td>50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>69</th>\n", + " <td>PB000206</td>\n", + " <td>contig_1</td>\n", + " <td>4440794</td>\n", + " <td>119</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>70</th>\n", + " <td>PB000207</td>\n", + " <td>contig_1</td>\n", + " <td>4424460</td>\n", + " <td>85</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>71</th>\n", + " <td>PB000208</td>\n", + " <td>contig_1</td>\n", + " <td>4419922</td>\n", + " <td>119</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>72</th>\n", + " <td>PB000209</td>\n", + " <td>contig_1</td>\n", + " <td>4443791</td>\n", + " <td>80</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>73</th>\n", + " <td>PB000210</td>\n", + " <td>contig_1</td>\n", + " <td>4444571</td>\n", + " <td>115</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>74</th>\n", + " <td>PB000211</td>\n", + " 
<td>contig_1</td>\n", + " <td>4441994</td>\n", + " <td>120</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75</th>\n", + " <td>PB000219</td>\n", + " <td>contig_1</td>\n", + " <td>4410932</td>\n", + " <td>101</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sample #seq_name length cov. circ. repeat mult. alt_group \\\n", + "0 PB000195 contig_1 4444823 114 Y N 1 * \n", + "5 PB000197 contig_1 4410932 129 Y N 1 * \n", + "8 PB000199 contig_1 4426248 112 Y N 1 * \n", + "9 PB000200 contig_1 4428581 84 Y N 1 * \n", + "10 PB000201 contig_1 4408479 96 Y N 1 * \n", + "18 PB000203 contig_1 4434008 47 Y N 1 * \n", + "19 PB000204 contig_1 4419776 74 Y N 1 * \n", + "20 PB000205 contig_50 4420883 37 Y Y 2 * \n", + "69 PB000206 contig_1 4440794 119 Y N 1 * \n", + "70 PB000207 contig_1 4424460 85 Y N 1 * \n", + "71 PB000208 contig_1 4419922 119 Y N 1 * \n", + "72 PB000209 contig_1 4443791 80 Y N 1 * \n", + "73 PB000210 contig_1 4444571 115 Y N 1 * \n", + "74 PB000211 contig_1 4441994 120 Y N 1 * \n", + "75 PB000219 contig_1 4410932 101 Y N 1 * \n", + "\n", + " graph_path \n", + "0 1 \n", + "5 1 \n", + "8 1 \n", + "9 1 \n", + "10 1 \n", + "18 1 \n", + "19 1 \n", + "20 50 \n", + "69 1 \n", + "70 1 \n", + "71 1 \n", + "72 1 \n", + "73 1 \n", + "74 1 \n", + "75 1 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md[md['circ.'] == 'Y']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sequences larger than 4 Mb" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " 
.dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sample</th>\n", + " <th>#seq_name</th>\n", + " <th>length</th>\n", + " <th>cov.</th>\n", + " <th>circ.</th>\n", + " <th>repeat</th>\n", + " <th>mult.</th>\n", + " <th>alt_group</th>\n", + " <th>graph_path</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>PB000195</td>\n", + " <td>contig_1</td>\n", + " <td>4444823</td>\n", + " <td>114</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>PB000196</td>\n", + " <td>contig_10</td>\n", + " <td>4421359</td>\n", + " <td>133</td>\n", + " <td>N</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>10,41</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>PB000197</td>\n", + " <td>contig_1</td>\n", + " <td>4410932</td>\n", + " <td>129</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>PB000198</td>\n", + " <td>contig_1</td>\n", + " <td>4332805</td>\n", + " <td>86</td>\n", + " <td>N</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>2,1,2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>PB000199</td>\n", + " <td>contig_1</td>\n", + " <td>4426248</td>\n", + " <td>112</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>PB000200</td>\n", + " <td>contig_1</td>\n", + " <td>4428581</td>\n", + " <td>84</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>10</th>\n", + " <td>PB000201</td>\n", + " <td>contig_1</td>\n", + " <td>4408479</td>\n", + " <td>96</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>PB000203</td>\n", + " <td>contig_1</td>\n", + " <td>4434008</td>\n", + " <td>47</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>PB000204</td>\n", + " <td>contig_1</td>\n", + " <td>4419776</td>\n", + " <td>74</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>PB000205</td>\n", + " <td>contig_50</td>\n", + " <td>4420883</td>\n", + " <td>37</td>\n", + " <td>Y</td>\n", + " <td>Y</td>\n", + " <td>2</td>\n", + " <td>*</td>\n", + " <td>50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>69</th>\n", + " <td>PB000206</td>\n", + " <td>contig_1</td>\n", + " <td>4440794</td>\n", + " <td>119</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>70</th>\n", + " <td>PB000207</td>\n", + " <td>contig_1</td>\n", + " <td>4424460</td>\n", + " <td>85</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>71</th>\n", + " <td>PB000208</td>\n", + " <td>contig_1</td>\n", + " <td>4419922</td>\n", + " <td>119</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>72</th>\n", + " <td>PB000209</td>\n", + " <td>contig_1</td>\n", + " <td>4443791</td>\n", + " <td>80</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>73</th>\n", + " <td>PB000210</td>\n", + " <td>contig_1</td>\n", + 
" <td>4444571</td>\n", + " <td>115</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>74</th>\n", + " <td>PB000211</td>\n", + " <td>contig_1</td>\n", + " <td>4441994</td>\n", + " <td>120</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75</th>\n", + " <td>PB000219</td>\n", + " <td>contig_1</td>\n", + " <td>4410932</td>\n", + " <td>101</td>\n", + " <td>Y</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>76</th>\n", + " <td>PB000220</td>\n", + " <td>contig_1</td>\n", + " <td>4319282</td>\n", + " <td>88</td>\n", + " <td>N</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>2,1,2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sample #seq_name length cov. circ. repeat mult. alt_group \\\n", + "0 PB000195 contig_1 4444823 114 Y N 1 * \n", + "1 PB000196 contig_10 4421359 133 N N 1 * \n", + "5 PB000197 contig_1 4410932 129 Y N 1 * \n", + "6 PB000198 contig_1 4332805 86 N N 1 * \n", + "8 PB000199 contig_1 4426248 112 Y N 1 * \n", + "9 PB000200 contig_1 4428581 84 Y N 1 * \n", + "10 PB000201 contig_1 4408479 96 Y N 1 * \n", + "18 PB000203 contig_1 4434008 47 Y N 1 * \n", + "19 PB000204 contig_1 4419776 74 Y N 1 * \n", + "20 PB000205 contig_50 4420883 37 Y Y 2 * \n", + "69 PB000206 contig_1 4440794 119 Y N 1 * \n", + "70 PB000207 contig_1 4424460 85 Y N 1 * \n", + "71 PB000208 contig_1 4419922 119 Y N 1 * \n", + "72 PB000209 contig_1 4443791 80 Y N 1 * \n", + "73 PB000210 contig_1 4444571 115 Y N 1 * \n", + "74 PB000211 contig_1 4441994 120 Y N 1 * \n", + "75 PB000219 contig_1 4410932 101 Y N 1 * \n", + "76 PB000220 contig_1 4319282 88 N N 1 * \n", + "\n", + " graph_path \n", + "0 1 \n", + "1 10,41 \n", + "5 1 \n", + "6 2,1,2 \n", + "8 1 \n", + "9 1 \n", + "10 1 
\n", + "18 1 \n", + "19 1 \n", + "20 50 \n", + "69 1 \n", + "70 1 \n", + "71 1 \n", + "72 1 \n", + "73 1 \n", + "74 1 \n", + "75 1 \n", + "76 2,1,2 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md[md['length']>4e6]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For which samples do we lack a circular genome?" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'PB000202', 'PB000220', 'PB000198', 'PB000196'}\n" + ] + } + ], + "source": [ + "samples_circ = md['sample'][md['circ.'] == 'Y']\n", + "samples_noncirc = md['sample'][md['sample'].isin(samples_circ) == False]\n", + "\n", + "print(set(samples_noncirc))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "â“ What is the matter with these genomes? \n", + "\n", + "â“ Should we include them in the graph?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sample</th>\n", + " <th>#seq_name</th>\n", + " <th>length</th>\n", + " <th>cov.</th>\n", + " <th>circ.</th>\n", + " <th>repeat</th>\n", + " <th>mult.</th>\n", + " <th>alt_group</th>\n", + " <th>graph_path</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>PB000196</td>\n", + " <td>contig_10</td>\n", + " <td>4421359</td>\n", + " <td>133</td>\n", + " <td>N</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>10,41</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>PB000196</td>\n", + " <td>contig_47</td>\n", + " <td>13589</td>\n", + " <td>32</td>\n", + " <td>N</td>\n", + " <td>Y</td>\n", + " <td>1</td>\n", + " <td>48</td>\n", + " <td>*,47,*</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>PB000196</td>\n", + " <td>contig_1</td>\n", + " <td>10270</td>\n", + " <td>45</td>\n", + " <td>N</td>\n", + " <td>Y</td>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>*,1,*</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>PB000196</td>\n", + " <td>contig_42</td>\n", + " <td>1662</td>\n", + " <td>52</td>\n", + " <td>N</td>\n", + " <td>N</td>\n", + " <td>1</td>\n", + " <td>*</td>\n", + " <td>-41,42</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sample #seq_name length cov. circ. repeat mult. 
alt_group graph_path\n", + "1 PB000196 contig_10 4421359 133 N N 1 * 10,41\n", + "2 PB000196 contig_47 13589 32 N Y 1 48 *,47,*\n", + "3 PB000196 contig_1 10270 45 N Y 1 6 *,1,*\n", + "4 PB000196 contig_42 1662 52 N N 1 * -41,42" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md[md['sample'] == 'PB000196']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Combine complete genomes\n", + "\n", + "Let us be liberal and just use all sequences > 4 Mb. We can add a tag to the name of the 'unclean' ones.\n", + "\n", + "First a quick look at the fasta headers." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">PB000195_1\n", + ">PB000196_1\n", + ">PB000196_2\n", + ">PB000196_3\n", + ">PB000197_1\n", + ">PB000198_1\n", + ">PB000198_2\n", + ">PB000199_1\n", + ">PB000200_1\n", + ">PB000201_1\n", + ">PB000202_1\n", + ">PB000202_2\n", + ">PB000202_3\n", + ">PB000202_4\n", + ">PB000202_5\n", + ">PB000202_6\n", + ">PB000202_7\n", + ">PB000203_1\n", + ">PB000204_1\n", + ">PB000205_1\n", + ">PB000205_2\n", + ">PB000205_3\n", + ">PB000205_4\n", + ">PB000205_5\n", + ">PB000205_6\n", + ">PB000205_7\n", + ">PB000205_8\n", + ">PB000205_9\n", + ">PB000205_10\n", + ">PB000205_11\n", + ">PB000205_12\n", + ">PB000205_13\n", + ">PB000205_14\n", + ">PB000205_15\n", + ">PB000205_16\n", + ">PB000205_17\n", + ">PB000205_18\n", + ">PB000205_19\n", + ">PB000205_20\n", + ">PB000205_21\n", + ">PB000205_22\n", + ">PB000205_23\n", + ">PB000205_24\n", + ">PB000205_25\n", + ">PB000205_26\n", + ">PB000205_27\n", + ">PB000205_28\n", + ">PB000205_29\n", + ">PB000205_30\n", + ">PB000205_31\n", + ">PB000205_32\n", + ">PB000205_33\n", + ">PB000205_34\n", + ">PB000205_35\n", + ">PB000205_36\n", + ">PB000205_37\n", + ">PB000205_38\n", + ">PB000205_39\n", + 
">PB000205_40\n", + ">PB000205_41\n", + ">PB000205_42\n", + ">PB000205_43\n", + ">PB000205_44\n", + ">PB000205_45\n", + ">PB000205_46\n", + ">PB000205_47\n", + ">PB000205_48\n", + ">PB000205_49\n", + ">PB000206_1\n", + ">PB000207_1\n", + ">PB000208_1\n", + ">PB000209_1\n", + ">PB000210_1\n", + ">PB000211_1\n", + ">PB000219_1\n", + ">PB000220_1\n", + ">PB000220_2\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "grep ^\\> data/MTBC.L1.assemblies.fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import SeqIO\n", + "\n", + "assemblies_all = 'data/MTBC.L1.assemblies.fasta'\n", + "assemblies_pggb = open('results/assemblies_4mb.fasta', 'w')\n", + "\n", + "# All sequences > 4Mb\n", + "md_4mb = md[md['length']>4e6]\n", + "\n", + "for rec in SeqIO.parse(assemblies_all, \"fasta\"):\n", + " \n", + " if len(rec.seq) < 4e6:\n", + " continue\n", + " \n", + " strain = rec.id.split('_')[0]\n", + " strain_circ = md_4mb['circ.'][md_4mb['sample'] == strain].to_string(index=False)\n", + " strain_repeat = md_4mb['repeat'][md_4mb['sample'] == strain].to_string(index=False) \n", + " \n", + " new_id = strain\n", + " if strain_circ == 'N':\n", + " new_id += '#circNo'\n", + " if strain_repeat == 'Y':\n", + " new_id += '#repeatY'\n", + " rec.id = new_id\n", + " rec.description = ''\n", + " SeqIO.write(rec, assemblies_pggb, 'fasta')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run PGGB!\n", + "\n", + "\n", + "\n", + "Important paramters:\n", + "\n", + " -i : path to combined assemblies\n", + " -o : output directory name\n", + " -n : Number of contigs included\n", + " -p : Similarity of contigs\n", + " -s : Maximum length of repeats in the genome" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create slurm script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + 
"outputs": [], + "source": [ + "%%bash\n", + "\n", + "echo '\\\n", + "#!/bin/bash\n", + "\n", + "#SBATCH --cpus-per-task=20\n", + "#SBATCH --mem-per-cpu=1G\n", + "#SBATCH --time=06:00:00\n", + "#SBATCH --qos=6hours\n", + "#SBATCH --output=stdout.%j\n", + "#SBATCH --error=stderr.%j\n", + "\n", + "singularity exec /scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/pggb_latest.sif pggb \\\n", + " -i assemblies_combined.fasta.gz \\\n", + " -o ./pggb \\\n", + " -m \\\n", + " -t 20 \\\n", + " -n 52 \\\n", + " -p 99 \\\n", + " -s 5k\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Variant calling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "REF=\n", + "\n", + "/scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/pggb_latest.sif vg deconstruct" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualization" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/workshop/B_call_variants.ipynb b/workshop/B_call_variants.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..0476c68dc1acb47f4485a306959a85fc1c081c54 --- /dev/null +++ b/workshop/B_call_variants.ipynb @@ -0,0 +1,57 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "REF=\n", + "\n", + 
"/scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/pggb_latest.sif vg deconstruct" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "singularity exec /home/cristobal/programs/pggb_latest.sif vg deconstruct \\\n", + " pggb/assemblies_combined.fasta.gz.d71a954.11fba48.a4da63a.smooth.final.gfa -d1 -e \\\n", + " -p H37Rv \\\n", + " -t 4 \\\n", + " --all-snarls \\\n", + " > variants.vcf\n", + "\n", + "grep -v -c ^# variants.vcf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://github.com/pangenome/resolve-nested-genotypes" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/tree_from_graph.ipynb b/workshop/C_core_genome_alignment.ipynb similarity index 100% rename from notebooks/tree_from_graph.ipynb rename to workshop/C_core_genome_alignment.ipynb diff --git a/workshop/D_graph_annotation.ipynb b/workshop/D_graph_annotation.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/workshop/E_gene_conversion.ipynb b/workshop/E_gene_conversion.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/workshop/README.md b/workshop/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c96d3aa6369f17750d7525064a19ff653769de40 --- /dev/null +++ b/workshop/README.md @@ -0,0 +1,71 @@ +# Workshop: Long-read genome assembly and pangenome graph analysis +Swiss TPH, 5.11.2024 + +The aim of this workshop is to get familiar with the pangenome graph framework, + +It consists of a series of Jupyter notebooks in which we progress from creating + +This workshop consists of a collection of Jupyter notebooks that + + +* [Set up](#set-up) +* [The 
data](#the-data) +* [A) Create a graph and visualize it](#a-create-a-graph--visualize) +* [B) Call variants from a graph](#b-call-variants-from-a-graph) +* [C) Core genome alignment](#c-core-genome-alignment) +* [D) Graph annotation & liftovers](#d-graph-annotation--liftovers) +* [E) SV categorization](#e-categorize-svs-into-insertions-and-deletions) +* [F) Identify gene conversion events](#f-identify-gene-conversion-events) + + +## Set up +This workshop is based on Jupyter notebooks, from which we run Python and bash code. You can run the notebooks from the sciCORE open-on-demand platform (http://ood-ubuntu.scicore.unibas.ch/) or from an editor like Visual Studio Code. + +We are going to use [PGGB](https://pggb.readthedocs.io/en/latest/) for creating and analyzing pangenome graphs, a tool set developed by the human pangenome consortium. See [Yang et al. 2023](https://doi.org/10.3389/fgene.2023.1225248) for an evaluation of PGGB with bacterial genomes. In addition, a few Python packages need to be installed, as explained below. Please try to do at least steps 1 and 2 below **before** the workshop; step 2 in particular might take some time. + + +Step 1: Clone the repository from GitLab. +``` +git clone https://git.scicore.unibas.ch/TBRU/PacbioSnake +``` + + +Step 2: Pull the PGGB container (2 GB, might take a while...) +``` +cd PacbioSnake +singularity pull docker://ghcr.io/pangenome/pggb:latest +``` + + +Step 3: Install Python packages (pandas and biopython). Ideally we create a new conda environment, which can then be selected as the kernel for running the notebooks. +``` +conda create -n genomegraphs pandas biopython +``` + + +## The data +The data we will explore in this workshop are 19 genomes that have been assembled from PacBio HiFi reads by Venus for her PhD project. All should be *Mycobacterium tuberculosis* strains belonging to lineage 1, and we expect them to be fairly diverse because they have been selected from across the diverse L1 clade. 
These genomes have neither been published nor thoroughly analyzed, so genuine discoveries are possible during the workshop! + + +## A. Create a graph & visualize it +Given a set of assemblies and some assembly metadata, we select assemblies, rename them, and write them to a single FASTA file. This is the only mandatory input for PGGB. We also explore the effect of two key parameters of PGGB, the minimum pairwise identity between seeds (-p) and the seed length (-s). + + +## B. Call variants from a graph +In this part, we obtain variants in classic VCF format from the graph, using an arbitrary reference assembly. A summary is created of the indels and SVs, and the complication of nested SVs is explored. + + +## C. Core genome alignment +To make sense of genetic variation we need a phylogenetic tree. Here we traverse the graph and extract SNPs in nodes that are shared by all assemblies (i.e. the core genome). These SNPs are used to create an alignment and to estimate a tree. + + +## D. Graph annotation & liftovers +In this part we explore variants in genes and regions of interest, making use of the lift-over functionalities of PGGB. These allow us to translate positions in one genome to any other genome in the graph. + + +## E. Categorize SVs into insertions and deletions +Telling whether a sequence missing in genome A reflects a deletion in an ancestor of A or an insertion elsewhere requires information about this sequence in more than two strains. Here we assess the frequency of the structural variant and its presence/absence in an outgroup strain in order to distinguish insertions from deletions. + + +## F. Identify gene conversion events +Gene conversion occurs through recombination between close paralogs and can result in heavily mutated genes. Gene conversion in the MTBC affects the functionally interesting PE/PPE gene families. Furthermore, treating SNPs introduced through gene conversion as point mutations can bias various downstream analyses.
Here we identify gene conversion events, which give themselves away as variant hotspots in single strains. diff --git a/workshop/data/MTBC.L1.assemblies.fasta b/workshop/data/MTBC.L1.assemblies.fasta new file mode 100644 index 0000000000000000000000000000000000000000..baeb7faa075416a09d948c0c3394716193722ba0 Binary files /dev/null and b/workshop/data/MTBC.L1.assemblies.fasta differ diff --git a/workshop/data/assembly_summaries.tsv b/workshop/data/assembly_summaries.tsv new file mode 100644 index 0000000000000000000000000000000000000000..5469b0ac9275f8d65720eacbb4537f4933c14735 --- /dev/null +++ b/workshop/data/assembly_summaries.tsv @@ -0,0 +1,79 @@ +sample #seq_name length cov. circ. repeat mult. alt_group graph_path +PB000195 contig_1 4444823 114 Y N 1 * 1 +PB000196 contig_10 4421359 133 N N 1 * 10,41 +PB000196 contig_47 13589 32 N Y 1 48 *,47,* +PB000196 contig_1 10270 45 N Y 1 6 *,1,* +PB000196 contig_42 1662 52 N N 1 * -41,42 +PB000197 contig_1 4410932 129 Y N 1 * 1 +PB000198 contig_1 4332805 86 N N 1 * 2,1,2 +PB000198 contig_2 110357 171 N Y 2 * 2 +PB000199 contig_1 4426248 112 Y N 1 * 1 +PB000200 contig_1 4428581 84 Y N 1 * 1 +PB000201 contig_1 4408479 96 Y N 1 * 1 +PB000202 contig_1 1530994 13 N N 1 * *,1,* +PB000202 contig_6 1184280 13 N N 1 * *,6,* +PB000202 contig_8 1077139 14 N N 1 * *,8,* +PB000202 contig_9 434170 13 N N 1 * *,9,* +PB000202 contig_7 100047 13 N N 1 * *,7,* +PB000202 contig_10 66133 13 N N 1 * *,10,* +PB000202 contig_5 33682 14 N N 1 * *,5,* +PB000203 contig_1 4434008 47 Y N 1 * 1 +PB000204 contig_1 4419776 74 Y N 1 * 1 +PB000205 contig_50 4420883 37 Y Y 2 * 50 +PB000205 contig_49 216232 6 N Y 1 * *,49,* +PB000205 contig_53 163344 6 N Y 1 * *,53,* +PB000205 contig_51 142109 6 N Y 1 * *,51,* +PB000205 contig_9 131264 6 N Y 1 * *,9,* +PB000205 contig_8 120360 6 N Y 1 * *,8,* +PB000205 contig_37 117735 6 N Y 1 * *,37,* +PB000205 contig_2 117435 7 N Y 1 * *,2,* +PB000205 contig_33 104292 6 N Y 1 * *,33,* +PB000205 contig_44 102971 6 N Y 1 * 
*,44,* +PB000205 contig_31 93943 5 N Y 1 * *,31,* +PB000205 contig_6 89619 5 N Y 1 * *,6,* +PB000205 contig_5 87578 6 N Y 1 * *,5,* +PB000205 contig_23 85152 5 N Y 1 * *,23,* +PB000205 contig_13 84548 5 N Y 1 * *,13,* +PB000205 contig_19 76959 5 N Y 1 * *,19,* +PB000205 contig_54 74739 5 N Y 1 * *,54,* +PB000205 contig_52 72581 5 N Y 1 * *,52,* +PB000205 contig_21 69776 6 N Y 1 * *,21,* +PB000205 contig_20 68663 5 N Y 1 * *,20,* +PB000205 contig_22 68060 6 N Y 1 * *,22,* +PB000205 contig_15 67068 6 N Y 1 * *,15,* +PB000205 contig_25 66397 7 N Y 1 * *,25,* +PB000205 contig_12 55613 7 N Y 1 * *,12,* +PB000205 contig_18 52108 5 N Y 1 * *,18,* +PB000205 contig_17 51926 6 N Y 1 * *,17,* +PB000205 contig_1 47711 6 N Y 1 * *,1,* +PB000205 contig_48 45758 6 N Y 1 * *,48,* +PB000205 contig_42 45451 5 N Y 1 * *,42,* +PB000205 contig_38 45013 6 N Y 1 * *,38,* +PB000205 contig_43 41645 5 N Y 1 * *,43,* +PB000205 contig_34 39550 6 N Y 1 * *,34,* +PB000205 contig_26 38539 7 N Y 1 * *,26,* +PB000205 contig_30 37006 7 N Y 1 * *,30,* +PB000205 contig_47 35474 6 N Y 1 * *,47,* +PB000205 contig_28 33249 7 N Y 1 * *,28,* +PB000205 contig_32 33146 5 N Y 1 * *,32,* +PB000205 contig_41 32796 5 N Y 1 * *,41,* +PB000205 contig_3 31924 7 N Y 1 * *,3,* +PB000205 contig_40 31087 6 N Y 1 * *,40,* +PB000205 contig_16 28303 6 N Y 1 * *,16,* +PB000205 contig_36 26152 6 N Y 1 * *,36,* +PB000205 contig_46 24509 6 N Y 1 * *,46,* +PB000205 contig_45 19766 6 N Y 1 * *,45,* +PB000205 contig_14 15999 5 N Y 1 * *,14,* +PB000205 contig_24 15160 8 N Y 1 * *,24,* +PB000205 contig_29 13880 6 N Y 1 * *,29,* +PB000205 contig_39 12605 8 N Y 1 * *,39,* +PB000205 contig_4 11914 6 N Y 1 * *,4,* +PB000206 contig_1 4440794 119 Y N 1 * 1 +PB000207 contig_1 4424460 85 Y N 1 * 1 +PB000208 contig_1 4419922 119 Y N 1 * 1 +PB000209 contig_1 4443791 80 Y N 1 * 1 +PB000210 contig_1 4444571 115 Y N 1 * 1 +PB000211 contig_1 4441994 120 Y N 1 * 1 +PB000219 contig_1 4410932 101 Y N 1 * 1 +PB000220 contig_1 4319282 88 N N 1 * 
2,1,2 +PB000220 contig_2 110354 166 N Y 2 * 2 diff --git a/workshop/data/read_summaries.tsv b/workshop/data/read_summaries.tsv new file mode 100644 index 0000000000000000000000000000000000000000..7249a255173ccbd030feda0e5c7d89230479d34b --- /dev/null +++ b/workshop/data/read_summaries.tsv @@ -0,0 +1,20 @@ +sample Yield Q7 bases Longest_read Num_of_reads Mean_read_length N50_read_length Mean_GC_content SD_GC_content Estimated non-sense read fraction Mean_coverage SD_coverage Estimated crude Xome size Num_of_trimmed_reads_5 Max_identity_adp5 Average_position_from_5_end +PB000195 496838122 99.90% 22822 96525 5147.248091168091 6095.0 0.6551989316940308 0.02350557968020439 0.0 106.6074447950661 6.515689275641003 4660444 (e = 0.0%) NA NA NA +PB000196 577922899 99.90% 22590 111741 5171.986101788958 6024.0 0.6549097299575806 0.023503180593252182 0.0 121.68457644074489 8.002560154467622 4749352 (e = 0.0%) NA NA NA +PB000197 561009752 99.89% 38252 110879 5059.657392292499 5845.0 0.6560201644897461 0.02437005750834942 0.0 124.07422861763345 6.4293378693676155 4521565 (e = 0.0%) 1 0.8478260869565217 55.0 +PB000198 383992338 99.90% 38782 79144 4851.818684928738 5651.0 0.6548930406570435 0.025800971314311028 0.0 72.55553984530178 6.5554600268589365 5292391 (e = 0.0%) NA NA NA +PB000199 493951865 99.91% 20156 106489 4638.524777207035 5299.0 0.654647707939148 0.023217422887682915 0.0 93.66784531033443 9.19962202133822 5273441 (e = 0.0%) NA NA NA +PB000200 363727687 99.90% 22639 74718 4868.006196632672 5659.0 0.6544701457023621 0.024078119546175003 0.0 78.77961877044588 5.397476074822339 4617027 (e = 0.0%) NA NA NA +PB000201 419391636 99.90% 32959 89432 4689.503041416942 5340.0 0.6566223502159119 0.0249309204518795 0.0 93.7500460347495 6.375166080751846 4473508 (e = 0.0%) NA NA NA +PB000202 57934854 99.92% 22459 16180 3580.6461063040792 3953.0 0.659533679485321 0.027512168511748314 0.0002 14.039143522362133 1.8212913989188695 4125840 (e = 0.0%) NA NA NA +PB000203 206235793 99.90% 
36492 44619 4622.151841143907 5304.0 0.6563654541969299 0.025541380047798157 0.0 46.51565870214725 3.522865285320088 4433685 (e = 0.0%) NA NA NA +PB000204 323460510 99.91% 22350 75398 4290.04098251943 4905.0 0.6549028158187866 0.024565229192376137 0.0 71.49859846690619 4.607852010001684 4524011 (e = 0.0%) NA NA NA +PB000205 184355678 99.90% 36335 42724 4315.037870985862 4882.0 0.6576676368713379 0.02602369524538517 0.0028 33.33687157434451 4.595978327212487 5514599 (e = 0.3%) NA NA NA +PB000206 515555342 99.90% 22956 98268 5246.421439329181 6221.0 0.6553107500076294 0.02373949997127056 0.0 111.32085766246837 6.980499520212127 4631255 (e = 0.0%) NA NA NA +PB000207 371607027 99.90% 29108 68207 5448.224185200933 6405.0 0.6548841595649719 0.023478785529732704 0.0 79.52935113256152 5.248578702547423 4672577 (e = 0.0%) NA NA NA +PB000208 517060722 99.90% 21247 99789 5181.540269969636 6096.0 0.6554038524627686 0.023686150088906288 0.0002 113.38246882208041 6.8755649188101495 4559411 (e = 0.0%) NA NA NA +PB000209 351043474 99.90% 23071 66955 5242.976237771638 6153.0 0.6550890803337097 0.023656221106648445 0.0002 75.2347457453207 5.193755501084815 4665042 (e = 0.0%) NA NA NA +PB000210 501693130 99.89% 23118 84353 5947.543418728439 6800.0 0.6554515957832336 0.022847512736916542 0.0 105.41636152332916 6.881157735196752 4759158 (e = 0.0%) NA NA NA +PB000211 529662039 99.89% 26563 89427 5922.842530779295 7181.0 0.6550024151802063 0.0231816116720438 0.0 112.72149266444393 7.720437008678516 4698855 (e = 0.0%) NA NA NA +PB000219 436604862 99.91% 21045 89326 4887.769092985245 5561.0 0.6548299193382263 0.02320525422692299 0.0 94.74941415110527 6.01532942153573 4607995 (e = 0.0%) NA NA NA +PB000220 393141481 99.90% 20088 81970 4796.162998658046 5586.0 0.6545571684837341 0.024594135582447052 0.0002 73.24541012773749 6.599511848402256 5366382 (e = 0.0%) NA NA NA diff --git a/workshop/pics/is6110_ppe.png b/workshop/pics/is6110_ppe.png new file mode 100755 index 
0000000000000000000000000000000000000000..f5048981f7c4c4638a1cbd141cc58cf4ffd6c968 Binary files /dev/null and b/workshop/pics/is6110_ppe.png differ diff --git a/workshop/pics/pggb-flow-diagram.png b/workshop/pics/pggb-flow-diagram.png new file mode 100644 index 0000000000000000000000000000000000000000..38c5e59fac9c6d7aa0eddb7f129dd0bc8630e96d Binary files /dev/null and b/workshop/pics/pggb-flow-diagram.png differ