Skip to content
Snippets Groups Projects
Commit 3955adf0 authored by Ticlla Ccenhua Monica Roxana's avatar Ticlla Ccenhua Monica Roxana
Browse files

Initial commit

parents
No related branches found
No related tags found
No related merge requests found
*.pyc
venv/
.cache/
tacos_env/
tests_data/
tacos.egg-info
.pytest_cache
.idea/
language: python
python:
- "3.5"
- "3.6"
# command to install dependencies
install:
- pip install -r requirements.txt
# command to run tests
script:
- pytest tests
LICENSE 0 → 100644
Apache Software License 2.0
Copyright (c) 2019, Monica Ticlla
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# Tacos
Tacos is a diverse Set Of Commands And Tools (SOCAT, but TACOS sounds much better!).
## Basic setup
Create virtual environment
```
$ conda update conda
$ conda create --prefix=tacos_env -c intel intelpython3_core python=3.5 icu=58 pandas
$ conda activate ./tacos_env
$ conda install -c conda-forge python-igraph
```
Install Tacos:
```
$ git clone $
$ pip install --editable .
```
Run the application:
```
$ tacos --help
```
To run the tests:
```
$ pytest tests
```
Tacos.py 0 → 100644
import click
import numpy as np
import pandas as pd
import itertools
import multiprocessing as mp
import igraph
def my_hamming(seq1, seq2, missing='X', gap='-', ignore_gap=False, adjust=False):
"""Return the Hamming distance between two given sequence of characters.
Sequences must have equal length.
Args:
seq1: numpy.ndarray (faster) or pandas.Series with sequence of characters.
seq1: numpy.ndarray (faster) or pandas.Series with sequence of characters.
missing: Character representing missing value in the sequence.
adjust: Boolean, default is False for no adjusment accounting for missing values.
Returns:
Int, absolute sequence difference after excluding positions with missing values.
"""
# Raises exception if vectors do not have equal length
try:
# get positions excluding missing values
if ignore_gap:
seq1_keep = ~(np.in1d(seq1,[missing,gap]))
seq2_keep = ~(np.in1d(seq2,[missing,gap]))
#seq1_keep = (seq1 != missing) & (seq1 != gap)
#seq2_keep = (seq2 != missing) & (seq2 != gap)
else:
seq1_keep = (seq1 != missing)
seq2_keep = (seq2 != missing)
to_keep = (seq1_keep & seq2_keep)
except ValueError:
print("seq1 and seq2 must have equal lengths")
else:
# drop positions with missing values and
# compute Hamming distance
# np.sum() instead of sum() improves speed considerably!
new_seq1 = seq1[to_keep]
new_seq2 = seq2[to_keep]
pair_dist = np.sum(new_seq1 != new_seq2)
# Adjust distance to account for missing values
if adjust:
seq1_effective_len = np.sum(seq1_keep)
seq2_effective_len = np.sum(seq2_keep)
pair_dist = len(seq1) * (2 * (pair_dist) / (seq1_effective_len + seq2_effective_len))
return (pair_dist)
@click.group()
def tacos():
pass
@tacos.command()
def genetic_distance():
'''Computes the genetic distance between genomes gi and gj, measured by the Hamming distance
(i.e number of substitutions needed to convert gj into gi)
'''
click.echo('compute_distances cmd1')
@tacos.command()
@click.option('-na','--missing', 'missing', default='X', show_default=True, help='Character for missing value')
@click.option('--gap', 'gap', default='-', show_default=True, help='Character to flag a position as indel')
@click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values')
@click.option('--adjust_na', 'adjust', is_flag=True, help='enable adjustment of distance for missing values')
@click.option('--ncores', 'ncores', default=1, show_default=True, help='specify number of processes for parallel computing')
@click.argument('INPUT_', required=True, type=click.Path(exists=True))
@click.argument('OUTPUT', required=True, type=click.Path())
def get_distance_matrix(missing, gap, ignore_gap, adjust, ncores, input_, output):
'''Computes the genetic distance between each genome pair in INPUT. Distances are measured as
the number of substitutions needed to convert genome g_j to genome g_i (i.e Hamming distance).
INPUT is the path to the file with variable positions (rows) and genomes (columns)
'''
INPUT_file = input_
positions_genomes = pd.read_csv(INPUT_file, sep='\t', index_col=False)
nr_genomes = len(positions_genomes.columns[3:])
genome_pairs = list(itertools.combinations(positions_genomes.columns[3:],2))
if ncores>1:
click.echo('Starting pool with {} processes ...'.format(ncores))
pool = mp.Pool(processes=4)
distance_mp = [(g1,g2,pool.apply_async(my_hamming,
args=(positions_genomes.get(g1).values,
positions_genomes.get(g2).values,
missing, gap,ignore_gap, adjust)))
for g1,g2 in genome_pairs]
pool.close()
pool.join()
pair_distances = [(g1,g2, p.get()) for g1,g2,p in distance_mp]
del distance_mp
else:
pair_distances = [(g1, g2, my_hamming(positions_genomes.get(g1).values,
positions_genomes.get(g2).values,
missing, gap, ignore_gap, adjust))
for g1,g2 in genome_pairs]
del positions_genomes
genomes_graph=igraph.Graph.TupleList(pair_distances, weights=True)
del pair_distances
genomes_names = genomes_graph.vs['name']
genomes_adj=genomes_graph.get_adjacency(type=2,attribute='weight')
del genomes_graph
with click.open_file(output, 'w') as f:
f.write('\t'.join(['']+genomes_names))
f.write('\n')
for row_ix in range(genomes_adj.shape[0]):
f.write('\t'.join(map(str,[genomes_names[row_ix]]+genomes_adj[row_ix])))
f.write('\n')
#genomes_graph.write_adjacency(output, sep='\t', attribute='weight')
#with click.open_file(output, 'w') as f:
# f.write('\n'.join('%s %s %f' % item for item in pair_distances))
click.echo('Finished! Computed {} pairwise distances for {} genomes.'.format(len(genome_pairs), nr_genomes))
if __name__ == '__main__':
tacos()
\ No newline at end of file
# Run pip install --requirement=requirements.txt to install all requirements
click==6.7
py==1.4.34
pytest==3.2.3
setup.py 0 → 100644
from setuptools import setup
setup(
name='tacos',
version='0.1',
py_modules=['Tacos'],
install_requires=[
'Click',
'numpy',
'pandas',
'python-igraph'
],
entry_points='''
[console_scripts]
tacos=Tacos:tacos
''',
)
\ No newline at end of file
def test_dummy():
assert True
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment