Skip to content
Snippets Groups Projects
genome_process.sh 2.36 KiB
#!/bin/bash

#########################################################
### Paula Iborra, Biozentrum, University of Basel     ###
### paula.iborra@alumni.esci.upf.edu                  ###
### JAN-2020                                          ###
#########################################################

#####################
###  DESCRIPTION  ###
#####################

# Download and process genome sequences fasta files.

####################
###  PARAMETERS  ###
####################

# Prefix for filenames
output_dir="$1"
log_dir="$2"   

# # Paths (DO NOT CHANGE!)
root="$PWD"
#root="$(cd "$(dirname "$0" )" && pwd)"

resDir="${root}/${output_dir}"
rawDir="${resDir}/raw"
logDir="${root}/${log_dir}"

# URLs
# ------
# All URLs variables represent Bash arrays, so that multiple URLs can be provided; in that case, 
# files are concatenated after download
genomeSeqURLs="$3"   #Modified by Iborra P

########################
###  PRE-REQUISITES  ###
########################

# Shell options
set -e
set -u
set -o pipefail

# Create directories
mkdir --parents "$resDir"
mkdir --parents "$rawDir"
mkdir --parents "$logDir"

# Create log file
logFile="${logDir}"
rm -fr "$logFile"; touch "$logFile"
>&2 echo "Log written to '$logFile'..."

##############
###  MAIN  ###
##############

## GET & FILTER GENE ANNOTATIONS

# Get genome sequences fasta files
echo "Downloading genome sequences files..." >> "$logFile"
# wget -i "${genomeSeqURLs}" --output-document "${rawDir}/${fileNamePrefix}.genome.fa.gz"
# genomeSeq="${resDir}/${fileNamePrefix}.genome.fa.gz"

for url in "${genomeSeqURLs[@]}"; do
    wget "$url" --output-document "${rawDir}/$(basename "$url")" &> /dev/null
done

# Concatenate genome sequences fasta files
echo "Concatenating genome sequences files..." >> "$logFile"
genomeSeq="${resDir}/genome.fa.gz"
for url in "${genomeSeqURLs[@]}"; do
    cat "${rawDir}/$(basename "$url")" >> "$genomeSeq"
done

# Trim genome sequences IDs
echo "Triming genome sequenes IDs..." >> "$logFile"
genomeSeqTrim="${resDir}/genome.processed.fa"
gunzip "$genomeSeq"
genomeSeq="${resDir}/genome.fa"
awk '{if ($1 ~ /^>/) {print $1} else {print $0}}' "$genomeSeq" > "$genomeSeqTrim"
rm "${resDir}/genome.fa"

#############
###  END  ###
#############

echo "Original data in: $rawDir" >> "$logFile"
echo "Processed data in: $resDir" >> "$logFile"
echo "Done. No errors." >> "$logFile"
>&2 echo "Done. No errors."