#!/usr/bin/env python
#
# Data manager for downloading Plant Tribes scaffolds data.
import argparse
import json
import os
import shutil
import sys
import tarfile
import urllib2
import zipfile


DEFAULT_DATA_TABLE_NAMES = ["plant_tribes_scaffolds"]


def add_data_table_entry(data_manager_dict, data_table_name, data_table_entry):
    """
    Append data_table_entry to the list stored under
    data_manager_dict['data_tables'][data_table_name], creating
    the 'data_tables' mapping and the per-table list when they
    are missing.  The dictionary is modified in place and also
    returned.
    """
    tables = data_manager_dict.setdefault('data_tables', {})
    tables.setdefault(data_table_name, []).append(data_table_entry)
    return data_manager_dict


def files_from_file_paths(file_paths):
    """
    Given a list of file system paths, return a list of
    absolute paths for all regular files found at or beneath
    those paths.  Directories are descended into recursively
    and are not themselves included in the result; paths that
    are neither an existing file nor an existing directory are
    skipped.
    """
    # Collect files.
    files = []
    for file_path in file_paths:
        file_path = os.path.abspath(file_path)
        if os.path.isfile(file_path):
            # Store full path for each file.
            files.append(file_path)
        elif os.path.isdir(file_path):
            # Descend into directory and collect the files.  Each child
            # is wrapped in a list because this function iterates over
            # its argument; a bare string would be walked one character
            # at a time.
            for f in os.listdir(file_path):
                files.extend(files_from_file_paths([os.path.join(file_path, f)]))
    return files


def import_from_server(data_manager_dict, target_directory, file_system_paths, description, link_to_data=False, data_table_names=DEFAULT_DATA_TABLE_NAMES):
    """
    Register files that already live on the Galaxy server.

    file_system_paths is a single whitespace-separated string of
    paths.  Every file found at or below those paths is symlinked
    (when link_to_data is true) or copied into target_directory
    under its base name, and one entry per name in data_table_names
    is added to data_manager_dict.  Each entry's 'path' is the
    original source path, not the location in target_directory.
    Returns the updated data_manager_dict.
    """
    # Galaxy may have escaped '\n' and '\r'; restore them so that the
    # whitespace split below separates the individual paths.
    unescaped = file_system_paths.replace('__cn__', '\n')
    unescaped = unescaped.replace('__cr__', '\r')
    for source_path in files_from_file_paths(unescaped.split()):
        base_name = os.path.basename(source_path)
        destination = os.path.join(target_directory, base_name)
        # Pick the placement strategy once, then apply it.
        place = os.symlink if link_to_data else shutil.copyfile
        place(source_path, destination)
        for table_name in data_table_names:
            row = dict(value=base_name, name=base_name, path=source_path, description=description)
            data_manager_dict = add_data_table_entry(data_manager_dict, table_name, row)
    return data_manager_dict


def make_directory(dir):
    """
    Create dir, including any missing parent directories,
    unless something already exists at that path.
    """
    if os.path.exists(dir):
        return
    os.makedirs(dir)


def remove_directory(dir):
    """
    Recursively delete the directory tree at dir; do nothing
    when the path does not exist.
    """
    if not os.path.exists(dir):
        return
    shutil.rmtree(dir)


def url_download(data_manager_dict, target_directory, url, description, data_table_names=DEFAULT_DATA_TABLE_NAMES):
    """
    Download the archive at url, unpack it and register its contents.

    The file is fetched into a 'scaffolds' work directory beneath the
    current working directory.  If it is a tar or zip archive it is
    extracted there, the extracted items are moved into
    target_directory, the work directory is removed, and one entry
    per name in data_table_names is added to data_manager_dict for
    every item found in target_directory.

    Returns the updated data_manager_dict.  When the downloaded file
    is not a recognised archive a message is written to stderr and
    data_manager_dict is returned unchanged.  Any error raised while
    downloading is written to stderr and then re-raised, so a missing
    or truncated file is never inspected or extracted.
    """
    work_directory = os.path.abspath(os.path.join(os.getcwd(), 'scaffolds'))
    make_directory(work_directory)
    file_path = os.path.join(work_directory, os.path.basename(url))
    src = None
    dst = None
    try:
        req = urllib2.Request(url)
        src = urllib2.urlopen(req)
        dst = open(file_path, 'wb')
        shutil.copyfileobj(src, dst)
    except Exception as e:
        sys.stderr.write("%s\n" % str(e))
        # Do not carry on with an absent or partial download.
        raise
    finally:
        if src:
            src.close()
        if dst:
            dst.close()
    if tarfile.is_tarfile(file_path):
        fh = tarfile.open(file_path, 'r:*')
    elif zipfile.is_zipfile(file_path):
        fh = zipfile.ZipFile(file_path, 'r')
    else:
        sys.stderr.write("%s is not a tar or zip archive, nothing was imported.\n" % file_path)
        # Hand the dictionary back untouched rather than None, which
        # would replace the caller's data with nothing.
        return data_manager_dict
    try:
        # NOTE(review): extractall() trusts the member names inside the
        # archive; an archive from an untrusted URL could write outside
        # work_directory.  Confirm that only trusted sources are used.
        fh.extractall(work_directory)
    finally:
        fh.close()
    os.remove(file_path)
    # Move the scaffolds data files into defined output directory.
    for filename in os.listdir(work_directory):
        shutil.move(os.path.join(work_directory, filename), target_directory)
    remove_directory(work_directory)
    # Populate the data table, there should be a single entry in target_directory.
    # Everything present in target_directory is listed, not only what was
    # just moved there.
    for entry_name in os.listdir(target_directory):
        # os.listdir() yields bare names, so build the real location
        # instead of recording a path relative to nothing in particular.
        full_path = os.path.abspath(os.path.join(target_directory, entry_name))
        for data_table_name in data_table_names:
            data_manager_dict = add_data_table_entry(data_manager_dict, data_table_name, dict(value=entry_name, name=entry_name, path=full_path, description=description))
    return data_manager_dict


# Command line interface.  All options are passed in by the Galaxy tool
# wrapper; argparse delivers every value as a string (or None when the
# option is omitted).
parser = argparse.ArgumentParser()
parser.add_argument('--data_source', dest='data_source', help='Data source')
parser.add_argument('--description', dest='description', default=None, help='Description')
parser.add_argument('--create_symlink', dest='create_symlink', default=None, help='Link files instead of copying')
parser.add_argument('--file_system_paths', dest='file_system_paths', default=None, help='File system paths')
parser.add_argument('--name', dest='name', help='Data table entry unique ID')
parser.add_argument('--out_file', dest='out_file', help='JSON output file')
parser.add_argument('--web_url', dest='web_url', default=None, help='Web URL')

args = parser.parse_args()

# Some magic happens with tools of type "manage_data" in that the output
# file magically contains some JSON data that allows us to define the target
# directory.
with open(args.out_file) as fh:
    params = json.loads(fh.read())
target_directory = params['output_data'][0]['extra_files_path']
make_directory(target_directory)

data_table_names = DEFAULT_DATA_TABLE_NAMES
if args.description is None:
    description = ''
else:
    description = args.description.strip()

# Initialize the data table.
data_manager_dict = {}
data_manager_dict['value'] = args.name.lower()
data_manager_dict['name'] = args.name
data_manager_dict['path'] = '.'
data_manager_dict['description'] = description

# Get the scaffolds data.
if args.data_source == 'web_url':
    data_manager_dict = url_download(data_manager_dict, target_directory, args.web_url, description, data_table_names=data_table_names)
else:
    # The keyword has to be link_to_data, the name import_from_server
    # declares; passing create_symlink= raises a TypeError.
    # NOTE(review): the option value is a string, so any non-empty value
    # (even 'False') is truthy -- confirm what the tool wrapper passes.
    data_manager_dict = import_from_server(data_manager_dict, target_directory, args.file_system_paths, description, link_to_data=args.create_symlink, data_table_names=data_table_names)

# Write the JSON output dataset.
with open(args.out_file, 'w') as fh:
    fh.write(json.dumps(data_manager_dict))
