diff --git a/README.md b/README.md
index 88e2f7d987f31c417dc9691d7d547b24c4def939..34c8f78b6f888556f3004e84141919f4b05fec56 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,33 @@
 # TACOS
 
-TACOS is a diverse Set Of Commands And Tools (SOCAT, but TACOS sounds much better!).
+TACOS is a diverse Set Of Commands And Tools (SOCAT, but TACOS sounds much better!) to handle genomic-derived data.
 
 ## Basic setup
 Create virtual environment
 ```
 $ conda update conda
-$ conda create --prefix=<path/tacos_env> -c intel intelpython3_core python=3.5 icu=58 pandas
+$ conda create --prefix=<path/tacos_env> -c intel intelpython3_core python=3.6.* icu=58 pandas scipy numpy joblib matplotlib
 $ conda activate <path/tacos_env>
-$ conda install -c conda-forge python-igraph
+$ conda install -c conda-forge python-igraph tqdm pycairo
+$ conda install -c bioconda gffutils
 ```
 
 Install Tacos:
 ```
+$ conda activate <path/tacos_env> # if not already activated
 $ git clone https://git.scicore.unibas.ch/TBRU/tacos.git
 $ cd tacos
-$ pip install .
+$ pip install -e .
 ```
 
 Run the application:
 ```
-$ conda activate <path/tacos_env>
+$ conda activate <path/tacos_env> # if not already activated
 $ tacos --help
 ```
 or
 ```
-$ <path/tacos_env>/bin/python -m tacos --help
+$ <path/tacos_env>/bin/python -m tacos --help # no need to activate the environment
 ```
 
 To run the tests:
diff --git a/setup.py b/setup.py
index 5446bf48a29be4d244edc6bee1df6477b6a08ea1..2e5f3acded37d467d5f3da0b0c9cec2a1990cd7e 100644
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,23 @@
-from setuptools import setup
+from setuptools import setup, find_packages
 
 setup(
     name='tacos',
     author='Monica R. Ticlla',
-    version='0.1',
-    py_modules=['tacos'],
+    author_email='mticlla@gmail.com',
+    version='0.2',
+    packages=find_packages(include=['tacos']),
     install_requires=[
         'Click',
         'numpy',
         'pandas',
-        'python-igraph'
+        'python-igraph',
+        'tqdm',
+        'scipy',
+        'joblib',
     ],
     entry_points='''
         [console_scripts]
-        tacos=tacos:tacos
+        tacos=tacos.cli:tacos
     ''',
+    test_suite='tests',
 )
\ No newline at end of file
diff --git a/tacos/__init__.py b/tacos/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..af130df4a5b5bff2c350a3f9c83f25a488304de3
--- /dev/null
+++ b/tacos/__init__.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+
+"""Top-level package for pathways."""
+
+__author__ = """Monica R. Ticlla Ccenhua"""
+__email__ = 'mticlla@gmail.com'
+__version__ = '0.1.0'
diff --git a/tacos/alignment_utils.py b/tacos/alignment_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d491532c8faabc71f186c3b8aa81bc9d116cef1
--- /dev/null
+++ b/tacos/alignment_utils.py
@@ -0,0 +1,77 @@
+"""
+    tacos.alignment_utils
+
+    Library for reading or re-formatting a genomic alignment table.
+
+    :copyright: (c) 2019 by Monica R. Ticlla
+    :license: MIT, see LICENSE for more details.
+
+    This module provides multiple functions for reading a genomic alignment table of the following format:
+        +-----------+--------+-----------+----------+-----------+-----+
+        |Position   | Gene   |Reference  |GENOME_1  |GENOME_2   | ... |
+        +-----------+--------+-----------+----------+-----------+-----+
+    Where:
+        - Position: int
+            Corresponds to a genomic position
+        - Gene: string
+            Name of Gene
+        - Reference: character
+            The allele for the corresponding position in the reference genome
+        - GENOME_1: character
+            The allele for the corresponding position in genome GENOME_1
+
+    Available functions:
+    - import_alignment: Reads a genomic alignment table into a numeric table.
+"""
+import numpy as np
+import csv
+
+def import_alignment(data_file, sep='\t'):
+    """Reads a genomic alignment table into a numeric table.
+
+    This function assigns numeric codes to each nucleotide (including IUPAC's ambiguous nucleotides) and returns a
+    positions-by-genomes table. Numeric codes are assigned as in the following dictionary:
+        {'A':1, 'G':2, 'C':3, 'T':4, ['Y','R','W','S','K','M','D','V','H','B','X','N']:-1, and ['.','-']:0}
+
+    Args:
+        data_file: str
+            A valid string path to a genomic alignment table.
+        sep: str, default '\t'
+            Delimiter to use.
+    Returns:
+         An m by n array of m original genomic sequences (i.e observations) in an n-dimensional space (genomic positions)
+         and two lists. The first list maps the row indices of the numpy.ndarray to the corresponding genomic positions.
+         The second list maps the column indices of the numpy.ndarray to the corresponding genomes. In the array,
+         allele Values (e.g. 'A', 'T', 'C', 'G') were converted to numerical values (e.g. 1,2,3,4).
+
+    """
+    # IUPAC's ambiguous nucleotides
+    nucleotides_letters = ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
+    # Numeric codes for each nucleotide
+    # Ambiguous nucleotides are coded as missing values and receive a -1 value
+    # nucleotide_numeric = list(range(1,15)) + [-1,-1,0,0]
+    nucleotide_numeric = [1, 2, 3, 4] + [-1] * 12 + [0, 0]
+    # Dictionary to map nucleotides letters to corresponding numeric equivalents
+    iupac_nucleotide_code_mapper = dict(zip(nucleotides_letters, nucleotide_numeric))
+
+    records_header = list()
+    # positions_gene = dict()
+    # positions_ref_allele = dict()
+    positions = []
+    positions_genomes_alleles = []
+    alignment_table_reader = csv.reader(open(data_file), delimiter=sep)
+
+    for ix, row in enumerate(alignment_table_reader):
+        if (ix == 0):
+            records_header = row
+        else:
+            position = row[0]
+            # positions_gene[position] = row[1]
+            # positions_ref_allele[position] = row[2]
+            positions.append(position)
+            alleles, inv_ix = np.unique(row[3:], return_inverse=True)
+            alleles_numeric = np.array([iupac_nucleotide_code_mapper[key] for key in alleles], dtype='int8')
+            positions_genomes_alleles.append(alleles_numeric[inv_ix])
+    genomes = records_header[3:]
+    positions_genomes_alleles = np.asarray(positions_genomes_alleles, dtype='int8')
+    return (genomes, positions, positions_genomes_alleles.T)
diff --git a/tacos/cli.py b/tacos/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..19df2cc603c5de2903b8543483e7d366eff07a53
--- /dev/null
+++ b/tacos/cli.py
@@ -0,0 +1,75 @@
+import click
+import numpy as np
+import itertools
+from tacos.alignment_utils import import_alignment
+from tacos.distance import hamming_pool
+
+@click.group()
+def tacos():
+    pass
+
+@tacos.command()
+@click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values')
+@click.option('--adjust_na', 'adjust', is_flag=True, help='enable adjustment of distance for missing values')
+@click.option('--n_jobs', 'n_jobs', default=1, show_default=True, help='specify number of processes for parallel computing')
+@click.option('--type', default='0', show_default=True, type=click.Choice(['0','1','2']),
+              help='either 1 (writes the lower triangle of the matrix), or 0 (writes the upper triangle) or 2 (writes both parts)')
+@click.argument('INPUT_', required=True, type=click.Path(exists=True))
+@click.argument('OUTPUT', required=True, type=click.Path())
+def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
+    '''Computes the genetic distance between each genome pair in INPUT.
+
+    Distances are measured as the number of substitutions needed to convert genome g_j to
+    genome g_i (i.e Hamming distance). For each pair positions with missing values are ignored.
+
+    INPUT is the path to the file with variable positions (rows) and genomes (columns) of
+    the following format:
+        +-----------+--------+-----------+----------+-----------+-----+
+        |Position   | Gene   |Reference  |GENOME_1  |GENOME_2   | ... |
+        +-----------+--------+-----------+----------+-----------+-----+
+    Where:
+        - Position: int
+            Corresponds to a genomic position
+        - Gene: string
+            Name of Gene
+        - Reference: str
+            The allele for the corresponding position in the reference genome.
+            Values are any of IUPAC's encoding:
+                ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
+        - GENOME_1: str
+            The allele for the corresponding position in genome GENOME_1
+            Values are any of IUPAC's encoding:
+                ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
+
+    Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N') are encoded as missing values.
+    '''
+    INPUT_file = input_
+    genomes, positions, genomes_positions = import_alignment(INPUT_file, sep='\t')
+    nr_genomes = len(genomes)
+    genome_pairs = list(itertools.combinations(genomes, 2))
+
+    distance_matrix = hamming_pool(data_array=genomes_positions,
+                                   missing=-1,
+                                   gap=0,
+                                   ignore_gap=ignore_gap,
+                                   adjust=adjust,
+                                   form=2,
+                                   n_jobs=n_jobs)
+    if type == '0':
+        distance_matrix = np.triu(distance_matrix)
+    elif type == '1':
+        distance_matrix = np.tril(distance_matrix)
+    else:
+        pass
+
+    with click.open_file(output, 'w') as f:
+        f.write('\t'.join(['']+genomes))
+        f.write('\n')
+        for row_ix in range(nr_genomes):
+            f.write(genomes[row_ix]+'\t')
+            f.write('\t'.join(map(str,distance_matrix[row_ix,:])))
+            f.write('\n')
+
+    click.echo('Finished! Computed {} pairwise distances for {} genomes.'.format(len(genome_pairs), nr_genomes))
+if __name__ == '__main__':
+    tacos()
\ No newline at end of file
diff --git a/tacos/distance.py b/tacos/distance.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b19ea6272fb63de7ef85efbc96760ff13f41579
--- /dev/null
+++ b/tacos/distance.py
@@ -0,0 +1,247 @@
+"""
+    tacos.distance
+
+    Library for computing genomic pairwise distance as nucleotide substitutions.
+
+    :copyright: (c) 2019 by Monica R. Ticlla
+    :license: MIT, see LICENSE for more details.
+
+    This module provides multiple functions to ...
+
+    Available functions:
+    - ...
+
+"""
+import numpy as np
+from scipy.spatial.distance import pdist, squareform
+import itertools
+import multiprocessing as mp
+from multiprocessing import get_context
+from contextlib import closing
+from functools import partial
+from joblib import Parallel, delayed
+import ctypes
+import tqdm
+
+def genetic_hamming(seq1, seq2, missing='X', gap='-', ignore_gap=False, adjust=False):
+    """Return the Hamming distance between two given sequence of characters or two discrete numerical vectors.
+    Sequences must have equal length.
+
+    This function computes the genetic distance between two sequences of nucleotides represented as characters (e.g A,
+    T, C, G) or as discrete numerical values (e.g 1, 2, 3, 4)
+
+    Args:
+        seq1: numpy.ndarray (faster), pandas.Series
+            Sequence of characters or discrete numerical vectors.
+        seq1: numpy.ndarray (faster), pandas.Series
+            Sequence of characters or discrete numerical vectors.
+        missing: Character or value (int) representing missing value in the sequence.
+        gap: Character or value (int) representing gap/deletion value in the sequence.
+        ignore_gap: boolean, default False
+            Flag to consider gaps as missing values instead of coding them as a 5th character
+        adjust: boolean, default False
+            Flag specifying adjustment of distance to account for missing values.
+    Returns:
+        Int, Float
+        Absolute sequence difference between seq1 and seq2, after excluding positions with missing values.
+
+    """
+    # Raises exception if vectors do not have equal length
+    try:
+        # get positions excluding missing values
+        if ignore_gap:
+            seq1_keep = ~(np.in1d(seq1,[missing,gap]))
+            seq2_keep = ~(np.in1d(seq2,[missing,gap]))
+            #seq1_keep = (seq1 != missing) & (seq1 != gap)
+            #seq2_keep = (seq2 != missing) & (seq2 != gap)
+        else:
+            seq1_keep = (seq1 != missing)
+            seq2_keep = (seq2 != missing)
+        to_keep = (seq1_keep & seq2_keep)
+    except ValueError:
+        print("seq1 and seq2 must have equal lengths.")
+    else:
+        # drop positions with missing values and
+        # compute Hamming distance
+        # np.sum() instead of sum() improves speed considerably!
+        new_seq1 = seq1[to_keep]
+        new_seq2 = seq2[to_keep]
+        pair_dist = np.sum(new_seq1 != new_seq2)
+        # Adjust distance to account for missing values
+        if adjust:
+            seq1_effective_len = np.sum(seq1_keep)
+            seq2_effective_len = np.sum(seq2_keep)
+            pair_dist = np.around(len(seq1) * (2 * (pair_dist) / (seq1_effective_len + seq2_effective_len)), decimals=2)
+        #else:
+            #pair_dist = np.sum(new_seq1 != new_seq2)
+
+        return (pair_dist)
+
+def hamming_pairs(genome_pairs, data_array, missing='X', gap='-', ignore_gap=False, adjust=False):
+    """
+    Args:
+        data_array: numpy.ndarray
+            An m by n array of m original genomic sequences (i.e observations) in an n-dimensional space (genomic positions).
+            Values can either be characters ('A', 'T', 'C', 'G') or numerical values (1,2,3,4). It's strongly recommended
+            to provide a ndarray of numerical vectors.
+    """
+    try:
+        pair_distances = np.asarray([genetic_hamming(data_array[g1,:], data_array[g2,:],
+                                                     missing, gap, ignore_gap, adjust)
+                                     for g1,g2 in tqdm.tqdm(genome_pairs)])
+    except IndexError:
+        print("Provided index is out of range in data_array.")
+    else:
+        return(pair_distances)
+
+def hamming_pdist(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1):
+    """Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in
+    n-dimensional space (genomic positions).
+
+    This function uses Scipy's pdist function to apply tacos.distance.genetic_hamming to every pair of genomic sequences in
+    data_array. It runs fast but it's not using multiprocessing.
+
+    Args:
+        data_array: numpy.ndarray
+            An m by n array of m original genomic sequences (i.e observations) in an n-dimensional space (genomic positions).
+            Values can either be characters ('A', 'T', 'C', 'G') or numerical values (1,2,3,4). It's strongly recommended
+            to provide a numpy.ndarray of numerical vectors.
+        missing: str, int, default 'X'
+            Value representing missing value in the genomic sequence.
+        gap: str, int, default '-'
+            Value representing gap/deletion value in the sequence.
+        ignore_gap: boolean, default False
+            Flag to consider gaps as missing values instead of coding them as a 5th character
+        adjust: boolean, default False
+            Flag specifying adjustment of genetic Hamming distance to account for missing values.
+        form:int, default 1
+            Specify the format of the resulting distance matrix as either condensed (1, a one-dimensional vector) or
+            square-form distance matrix (2)
+
+    Returns:
+        numpy.ndarray
+        A condensed distance matrix (vector-form) or a redundant square-form distance matrix.
+
+    """
+    try:
+        out_dist = pdist(data_array, genetic_hamming, missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
+    except ValueError as e:
+        print(e)
+    else:
+        if form != 1:
+            return(squareform(out_dist))
+        else:
+            return(out_dist)
+
+
+def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=1):
+    """Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in
+    n-dimensional space (genomic positions).
+
+    This function uses multiprocessing.Pool to apply tacos.distance.genetic_hamming to every pair of genomic sequences in
+    data_array.
+
+    Args:
+        data_array: numpy.ndarray
+            An m by n array of m original genomic sequences (i.e observations) in an n-dimensional space (genomic positions).
+            Values can either be characters ('A', 'T', 'C', 'G') or numerical values (1,2,3,4). It's strongly recommended
+            to provide a numpy.ndarray of numerical vectors.
+        missing: str, int, default 'X'
+            Value representing missing value in the genomic sequence.
+        gap: str, int, default '-'
+            Value representing gap/deletion value in the sequence.
+        ignore_gap: boolean, default False
+            Flag to consider gaps as missing values instead of coding them as a 5th character
+        adjust: boolean, default False
+            Flag specifying adjustment of genetic Hamming distance to account for missing values.
+        form:int, default 1
+            Specify the format of the resulting distance matrix as either condensed (1, a one-dimensional vector) or
+            square-form distance matrix (2).
+        n_jobs: int
+            The number of concurrent processes.
+
+    Returns:
+        numpy.ndarray
+        A condensed distance matrix (vector-form) or a redundant square-form distance matrix.
+    """
+    if n_jobs==1:
+        return(hamming_pdist(data_array=data_array, missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust,
+                             form=form))
+    else:
+        nr_genomes = data_array.shape[0]
+        genome_ix_pairs = list(itertools.combinations(range(0,nr_genomes),2))
+
+        # Here we copy the data_array to a shared memory array
+        # Using the shared memory array gained 5 fold more speed
+        data_array_shm = mp.Array(ctypes.c_int8, data_array.size, lock=False)
+        data_array_shm_b = np.frombuffer(data_array_shm, dtype=ctypes.c_int8)
+        data_array_shm_b.shape = data_array.shape
+        data_array_shm_b[:] = data_array
+
+
+        # Start a multiprocessing pool
+        genome_pairs_chunks = np.array_split(genome_ix_pairs, n_jobs)
+        with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+            print('starting pool with {} processes'.format(n_jobs))
+            get_distance_partial = partial(hamming_pairs,
+                                           data_array=data_array_shm_b,
+                                           missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
+
+            distance_mp = pool.imap(get_distance_partial, tqdm.tqdm(genome_pairs_chunks), chunksize=1)
+        pool.close()
+        pool.join()
+
+        pair_distances = [pair_dist for item in distance_mp for pair_dist in item]
+        if form == 1:
+            return(pair_distances)
+        else:
+            return (squareform(pair_distances))
+
+def hamming_joblib(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=2):
+    """Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in
+    n-dimensional space (genomic positions).
+
+    This function uses joblib.Parallel to apply tacos.distance.genetic_hamming to every pair of genomic sequences in
+    data_array.
+
+    Args:
+        data_array: numpy.ndarray
+            An m by n array of m original genomic sequences (i.e observations) in an n-dimensional space (genomic positions).
+            Values can either be characters ('A', 'T', 'C', 'G') or numerical values (1,2,3,4). It's strongly recommended
+            to provide a numpy.ndarray of numerical vectors.
+        missing: str, int, default 'X'
+            Value representing missing value in the genomic sequence.
+        gap: str, int, default '-'
+            Value representing gap/deletion value in the sequence.
+        ignore_gap: boolean, default False
+            Flag to consider gaps as missing values instead of coding them as a 5th character
+        adjust: boolean, default False
+            Flag specifying adjustment of genetic Hamming distance to account for missing values.
+        form:int, default 1
+            Specify the format of the resulting distance matrix as either condensed (1, a one-dimensional vector) or
+            square-form distance matrix (2).
+        n_jobs: int
+            The number of parallel jobs.
+
+    Returns:
+        numpy.ndarray
+        A condensed distance matrix (vector-form) or a redundant square-form distance matrix.
+
+    """
+    try:
+        nr_genomes = data_array.shape[0]
+        genome_ix_pairs = list(itertools.combinations(range(0, nr_genomes), 2))
+        out_dist = np.asarray(Parallel(n_jobs=n_jobs, prefer='threads')(delayed(genetic_hamming)(data_array[g1_ix, :],
+                                                                                                  data_array[g2_ix,:],
+                                                                                                 missing = missing,
+                                                                                                 gap = gap,
+                                                                                                 ignore_gap=ignore_gap,
+                                                                                                 adjust=adjust)
+                                                                        for g1_ix, g2_ix in tqdm.tqdm(genome_ix_pairs)))
+    except ValueError as e:
+        print(e)
+    else:
+        if form != 1:
+            return(squareform(out_dist))
+        else:
+            return(out_dist)
\ No newline at end of file