Mercurial > repos > trinity_ctat > ctat_centrifuge_index_data_manager

--- a/data_manager/add_ctat_centrifuge_index.py	Fri Apr 27 07:24:29 2018 -0400
+++ b/data_manager/add_ctat_centrifuge_index.py	Tue May 01 09:31:18 2018 -0400
@@ -14,6 +14,9 @@
 #import urllib
 import subprocess

+# The following is used to generate a unique_id value
+from datetime import *
+
 # Remove the following line when testing without galaxy package:
 from galaxy.util.json import to_json_string
 # Am not using the following:
@@ -30,8 +33,10 @@

 _CTAT_CentrifugeIndexPage_URL = 'https://ccb.jhu.edu/software/centrifuge/'
 _CTAT_CentrifugeDownload_URL = 'ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz'
+_CTAT_CentrifugeIndexTableName = 'ctat_centrifuge_indexes'
 _CTAT_CentrifugeDir_Name = 'p_compressed+h+v'
 _CTAT_Centrifuge_DisplayNamePrefix = 'CTAT_CentrifugeIndex_'
+_CentrifugeIndexFileExtension = 'cf'
 _NumBytesNeededForIndex = 7400130287 # 6.9 GB
 #_DownloadFileSize = 5790678746 # 5.4 Gigabytes.
 _Download_TestFile = 'write_testfile.txt'
@@ -58,36 +63,36 @@
 # End of class FileListParser

 def get_ctat_centrifuge_index_locations():
+    # For dynamic options need to return an interable with contents that are tuples with 3 items.
+    # Item one is a string that is the display name put into the option list.
+    # Item two is the value that is put into the parameter associated with the option list.
+    # Item three is a True or False value, indicating whether the item is selected.
+    options = []
     # open the url and retrieve the filenames of the files in the directory.
     resource = urllib2.urlopen(_CTAT_CentrifugeIndexPage_URL)
     theHTML = resource.read()
     filelist_parser = FileListParser()
     filelist_parser.feed(theHTML)
-    # return a tuple of the filenames
-    # return tuple(filelist_parser.filenames)
-    # For now, I am printing the list, just so I can see what was returned,
     # This is what was returned on 2018-04-23
     # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed_2018_4_15.tar.gz
     # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/nt_2018_3_3.tar.gz
     # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz
     # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz
+    # Which could be hard coded:
+    # vals.append(("p_compressed+h+v", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz", True))
+    # vals.append(("p+h+v", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz", False))
+    # vals.append(("nt_2018_3_3", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/nt_2018_3_3.tar.gz", False))
+    # vals.append(("p_compressed_2018_4_15", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed_2018_4_15.tar.gz", False))
     # but only returning the one we want, which for now is assumed to be present.
+    # For now, I am printing the list, just so I can see what was returned,
     print "FYI: The URL's that were found on Centrifuge's page are:"
     print "\t" + "\n\t".join(filelist_parser.filenames)
-    # Instead of sending back the list of found URL's, send back the one URL we want.
-    # But the selection interface chokes on a single value, so send two of the same item.
-    #vals=set()
-    #vals.add("one")
-    #vals.add("two")
-    # vals.add(_CTAT_CentrifugeDownload_URL)
-    # vals = ("one", "onemore", "/path/to/one")
-    vals = [dict(name="one", value="onemore", path="/path/to/one")]
+    # For now instead of sending back the list of found URL's, send back the one URL we want.
+    # Currently, only one of the options is supported.
+    vals.append((_CTAT_CentrifugeDir_Name, _CTAT_CentrifugeDownload_URL, True))
     print "The items in vals are:"
     print str(vals)
-    return vals # works kind of, but nothing is printed in the pull down list.
-    # return tuple(vals)
-    # return tuple(filelist_parser.filenames)
-    # return ("one","two") # only prints 'o' and 't'.
+    return vals

 # The following was used by the example program to get input parameters through the json.
 # Just leaving here for reference.
@@ -186,7 +191,8 @@
         # We want to transfer and untar the file without storing the tar file, because that
         # adds all that much more space to the needed amount of free space on the disk.
         # Use subprocess to pipe the output of curl into tar.
-        command = "curl {:s} | tar -xzvf - -C {:s}".format(src_location, cannonical_destination)
+        # Make curl silent so progress is not printed to stderr.
+        command = "curl --silent {:s} | tar -xzf - -C {:s}".format(src_location, cannonical_destination)
         try: # to send the command that downloads and extracts the file.
             command_output = subprocess.check_output(command, shell=True)
             # FIX - not sure check_output is what we want to use. If we want to have an error raised on
@@ -232,7 +238,7 @@

 def main():
     #Parse Command Line
-    print "At start before parsing arguments."
+    # print "At start before parsing arguments."
     parser = argparse.ArgumentParser()
     parser.add_argument('-d', '--download_location', default="", \
         help='This is the download location of the centrifuge index.')
@@ -255,12 +261,13 @@
     # target_directory = params['output_data'][0]['extra_files_path']
     # os.mkdir(target_directory)

-    print "Arguments are parsed."
-    print "download location is {:s}".format(str(args.download_location))
+    # print "Arguments are parsed."
+    print "\ndownload_location is {:s}".format(str(args.download_location))
     print "display_name is {:s}".format(str(args.display_name))
-    print "destination path is {:s}".format(str(args.destination_path))
+    print "destination_path is {:s}\n".format(str(args.destination_path))
     root_index_dirname = None
     # FIX - Prob don't need index_was_downloaded. Not doing anything with it.
+    # But it indicates success downloading the index, so maybe should be checking it.
     index_was_downloaded = False
     if (args.download_location != ""):
         index_directory, root_index_dirname, index_was_downloaded = \
@@ -268,14 +275,45 @@
                            destination=args.destination_path, \
                            force_download=args.force_download)
     else:
-        index_directory = args.destination_path
-        if not os.path.exists(index_directory):
-            raise ValueError("Cannot find the Centrifuge Index. " + \
+        cannonical_destination = os.path.realpath(args.destination_path)
+        if not os.path.exists(cannonical_destination):
+            raise ValueError("Cannot find the Centrifuge Index.\n" + \
                 "The directory does not exist:\n\t{:s}".format(index_directory))
-        # FIX - Check if there is an actual Centrifuge Index in there.
+        # If args.destination_path is a directory containing
+        # a subdirectory that contains the index files,
+        # then we need to set the index_directory to be that subdirectory.
+        files_in_destination_path = os.listdir(cannonical_destination)
+        if (len(files_in_destination_path) == 1):
+            path_to_file = "{:s}/{:s}".format(cannonical_destination, files_in_destination_path[0])
+            if os.path.isdir(path_to_file):
+                index_directory = path_to_file
+            else:
+                index_directory = cannonical_destination
+        else:
+            index_directory = cannonical_destination
+        # Get the root_index_dirname of the index from the index_directory name.
+        root_index_dirname = index_directory.split("/")[-1].split(".")[0]

+    # Check if there is an actual Centrifuge Index file in the index_directory.
     print "\nThe location of the Centrifuge Index is {:s}.\n".format(index_directory)
+    files_in_index_directory = set(os.listdir(index_directory))
+    index_file_found = False
+    index_file_path = index_directory
+    for filename in files_in_index_directory:
+        # The current index is split into 3 files.
+        # filenames are in the form: index_root_name.#.cf,
+        # where # is a numeral (1, 2, or 3)
+        # indicating the order of the files.
+        if filename.split(".")[-1] == _CentrifugeIndexFileExtension:
+            index_file_found = True
+            # The centrifuge program wants the root name of the files to be final part of the path.
+            index_file_path = "{:s}/{:s}".format(index_directory, filename.split(".")[0])
+    if not index_file_found:
+        raise ValueError("Cannot find any Centrifuge Index files.\n" + \
+            "The contents of the directory {:s} are:\n\t".format(index_directory) + \
+            "\n\t".join(files_in_index_directory))

+    # Set the display_name
     if (args.display_name is None) or (args.display_name == ""):
         # Use the root_index_dirname.
         if (root_index_dirname != None) and (root_index_dirname != ""):
@@ -284,20 +322,25 @@
             display_name = _CTAT_Centrifuge_DisplayNamePrefix + _CTAT_CentrifugeDir_Name
             print "WARNING: Did not set the display name. Using the default: {:s}".format(display_name_value)
     else:
-        display_name = args.display_name
+        display_name = _CTAT_Centrifuge_DisplayNamePrefix + args.display_name
+    display_name = display_name.replace(" ","_")
+
+    # Set the unique_id
+    datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
     if (root_index_dirname != None) and (root_index_dirname != ""):
-        unique_id = root_index_dirname
+        unique_id = root_index_dirname + datetime_stamp
     else:
-        unique_id = _CTAT_CentrifugeDir_Name
+        unique_id = _CTAT_CentrifugeDir_Name + datetime_stamp
+
     print "The Index's display_name will be set to: {:s}\n".format(display_name)
     print "Its unique_id will be set to: {:s}\n".format(unique_id)
-    print "Its dir_path will be set to: {:s}\n".format(index_directory)
+    print "Its dir_path will be set to: {:s}\n".format(index_file_path)

     data_manager_dict = {}
     data_manager_dict['data_tables'] = {}
-    data_manager_dict['data_tables']['ctat_centrifuge_index'] = []
-    data_table_entry = dict(value=unique_id, name=display_name, path=index_directory)
-    data_manager_dict['data_tables']['ctat_centrifuge_index'].append(data_table_entry)
+    data_manager_dict['data_tables'][_CTAT_CentrifugeIndexTableName] = []
+    data_table_entry = dict(value=unique_id, name=display_name, path=index_file_path)
+    data_manager_dict['data_tables'][_CTAT_CentrifugeIndexTableName].append(data_table_entry)

     # Temporarily the output file's dictionary is written for debugging:
     print "The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict))
--- a/data_manager/add_ctat_centrifuge_index.xml	Fri Apr 27 07:24:29 2018 -0400
+++ b/data_manager/add_ctat_centrifuge_index.xml	Tue May 01 09:31:18 2018 -0400
@@ -43,13 +43,18 @@
                      However, I have not been able to figure out how to send information back correctly
                      from the function and there is no documentation that I have found showing how to do it.
                 <param name="filename" type="select" label="Select File" display="radio"
-                    dynamic_options="get_ctat_centrifuge_index_locations()" help="Select a Centrifuge Index to Download." />
-                -->
+                    dynamic_options="get_ctat_centrifuge_index_locations()"
+                    help="Select a Centrifuge Index to Download." />
+                Hard coded version.
                 <param name="filename" type="text" value="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz">
                     <option value="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz">
                         p_compressed+h+v
                     </option>
                 </param>
+                -->
+                <param name="filename" type="select" label="Select File"
+                    dynamic_options="get_ctat_centrifuge_index_locations()"
+                    help="Select a Centrifuge Index to Download." />
                 <param name="force_download" type="boolean" checked="false" label="Force New Download?" />
             </when>
         </conditional>