#!/bin/sh

# Remote root that is searched for genomes, and the wget command line used
# for every transfer (passive-mode FTP).  $wget is expanded unquoted by its
# users so that the option is split off as a separate word.
source=ftp://ftp.ncbi.nih.gov/genomes
wget="wget --passive-ftp"

# Choose the local genome repository.  Both well-known locations are probed;
# /var/cache/genomes wins when both exist.  A value of $repository inherited
# from the environment is only used when neither location exists.
if [ -d /Library/Caches/Genomes ]; then
    repository=/Library/Caches/Genomes
fi
if [ -d /var/cache/genomes ]; then
    repository=/var/cache/genomes
fi

# Two separate tests joined with || instead of the obsolescent, ambiguous
# `-o` operator; the diagnostic goes to stderr.
if [ -z "$repository" ] || [ ! -d "$repository" ]; then
    echo "Can't find genome repository." >&2
    exit 1
fi

# getdir URL
#
# Fetch the listing of URL with $wget and extract its sub-directory links.
#
# Globals:   wget (read) - command line used for the fetch; expanded
#                          unquoted on purpose so its options are split.
# Arguments: $1 - URL of the directory, without a trailing slash.
# Files:     index.html - listing left behind by the fetch (current dir).
#            .dirs      - one link target per line, taken from the lines of
#                         index.html that match 'Directory.*href=', with any
#                         trailing slashes removed.  Empty when the fetch
#                         fails or the listing has no such lines.
# Returns:   0 if index.html exists after the fetch, non-zero otherwise.
getdir() {
    # Discard the results of any earlier call: a leftover index.html would
    # make a failed fetch look successful, and a leftover .dirs would hand
    # the caller entries that belong to a different directory.
    rm -f index.html*
    : > .dirs
    $wget "$1/" > /dev/null 2>&1
    if [ -f index.html ]; then
        grep 'Directory.*href=' index.html | \
            sed -e 's/.*href="//' -e 's/\/*">.*//' > .dirs
    fi
    [ -f index.html ]
}

# ---------------------------------------------------------------------------
# Command line: <genus> <species> [<strain>]
# ---------------------------------------------------------------------------
genus=$1
species=$2
strain=$3
if [ -z "$genus" ] || [ -z "$species" ]; then
    echo "Usage: $0 <genus> <species> [<strain>]" >&2
    exit 1
fi

# Directory name looked for on the server: Genus_species[_strain].
organism=${genus}_${species}
if [ -n "$strain" ]; then
    organism=${organism}_$strain
fi
# Alternative name with the first underscore-delimited word cut down to its
# first character, e.g. Escherichia_coli -> E_coli (sed replaces only the
# first match because there is no `g` flag).
abbrev=$(printf '%s\n' "$organism" | sed -e 's/\(.\)[^_]*/\1/')

# ---------------------------------------------------------------------------
# Locate the genome: first directly below $source, then one level down in
# every top-level directory whose link contains no underscore.
# ---------------------------------------------------------------------------
url=    # start empty so a value inherited from the environment is never used
if getdir "$source/$organism"; then
    url=$source/$organism
elif getdir "$source/$abbrev"; then
    url=$source/$abbrev
elif getdir "$source"; then
    # The candidate list is captured by the command substitution before the
    # loop body runs, so getdir rewriting .dirs inside the loop is harmless.
    # Each entry is used as a complete URL.  Word splitting of the
    # substitution is intentional: one entry per line.
    for subdir in $(grep -v _ .dirs); do
        echo "Checking $subdir."
        if getdir "$subdir/$organism"; then
            url=$subdir/$organism
            break
        elif getdir "$subdir/$abbrev"; then
            url=$subdir/$abbrev
            break
        fi
    done
fi

# getdir wrote its scratch files into the directory the script was started
# from; remove them before leaving that directory for good.
rm -f index.html* .dirs

if [ -z "$url" ]; then
    echo "Could not find a genome for $organism." >&2
    exit 1
fi

# ---------------------------------------------------------------------------
# Download into <repository>/<organism>.  Abort if the target directory
# cannot be entered, so nothing is downloaded into an unintended place.
# ---------------------------------------------------------------------------
cd "$repository" || exit 1
mkdir -p "$organism" || exit 1
cd "$organism" || exit 1
echo
echo ======================================================================
echo "Using $url."
echo ======================================================================
echo
# The two patterns are spelled out because brace expansion ({,.gz}) is not
# part of POSIX sh.  They are quoted so that wget, not the shell, performs
# the globbing.
$wget -c -N --glob=on "$url/*.gbk" "$url/*.gbk.gz"

# Repeat the download for every sub-directory listed under the genome
# directory; skipped when the listing cannot be fetched.
if getdir "$url"; then
    for dir in $(cat .dirs); do
        echo
        echo ======================================================================
        echo "Scanning $dir."
        echo ======================================================================
        echo
        $wget -c -N --glob=on "$dir/*.gbk" "$dir/*.gbk.gz"
    done
fi
rm -f index.html* .dirs
