Build branch main with version main (65dd41d)

Build pipeline: viash-hub.htrnaseq.main-vhms8 Source commit: 65dd41d8b1 Source message: Optimize spawning of processes
2024-11-05 17:26:35 +00:00
parent b8abf8c490
commit f2ff92c6ac
127 changed files with 29560 additions and 1004 deletions
--- a/target/nextflow/stats/combine_star_logs/main.nf
+++ b/target/nextflow/stats/combine_star_logs/main.nf
@@ -1,8 +1,8 @@
 // combine_star_logs main
 // 
-// This wrapper script is auto-generated by viash 0.9.0-RC7 and is thus a
-// derivative work thereof. This software comes with ABSOLUTELY NO WARRANTY from
-// Data Intuitive.
+// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative
+// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
+// Intuitive.
 // 
 // The component may contain files which fall under a different license. The
 // authors of this component should specify the license in the header of such
@@ -1728,7 +1728,9 @@ def publishStates(Map args) {

          def yamlFilename = yamlTemplate_
            .replaceAll('\\$id', id_)
+            .replaceAll('\\$\\{id\\}', id_)
            .replaceAll('\\$key', key_)
+            .replaceAll('\\$\\{key\\}', key_)

            // TODO: do the pathnames in state_ match up with the outputFilenames_?

@@ -1799,7 +1801,9 @@ def publishStatesByConfig(Map args) {
          def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml'
          def yamlFilename = yamlTemplate
            .replaceAll('\\$id', id_)
+            .replaceAll('\\$\\{id\\}', id_)
            .replaceAll('\\$key', key_)
+            .replaceAll('\\$\\{key\\}', key_)
          def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent()

          // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where
@@ -1841,7 +1845,9 @@ def publishStatesByConfig(Map args) {
                // instantiate the template
                def filename = filenameTemplate
                  .replaceAll('\\$id', id_)
+                  .replaceAll('\\$\\{id\\}', id_)
                  .replaceAll('\\$key', key_)
+                  .replaceAll('\\$\\{key\\}', key_)
                if (par.multiple) {
                  // if the parameter is multiple: true, the filename
                  // should contain a wildcard '*' that is replaced with
@@ -2809,7 +2815,7 @@ meta = [
          "type" : "string",
          "name" : "--barcodes",
          "description" : "Barcodes responding to the respective log files.\n",
-          "required" : false,
+          "required" : true,
          "direction" : "input",
          "multiple" : true,
          "multiple_sep" : ";"
@@ -2823,7 +2829,7 @@ meta = [
          ],
          "must_exist" : true,
          "create_parent" : true,
-          "required" : false,
+          "required" : true,
          "direction" : "input",
          "multiple" : true,
          "multiple_sep" : ";"
@@ -2837,7 +2843,7 @@ meta = [
          ],
          "must_exist" : true,
          "create_parent" : true,
-          "required" : false,
+          "required" : true,
          "direction" : "input",
          "multiple" : true,
          "multiple_sep" : ";"
@@ -2848,7 +2854,7 @@ meta = [
          "description" : "Paths to the 'ReadsPerGene.out.tab' files as output by STAR.\n",
          "must_exist" : true,
          "create_parent" : true,
-          "required" : false,
+          "required" : true,
          "direction" : "input",
          "multiple" : true,
          "multiple_sep" : ";"
@@ -3026,15 +3032,23 @@ meta = [
    "runner" : "nextflow",
    "engine" : "docker|native",
    "output" : "target/nextflow/stats/combine_star_logs",
-    "viash_version" : "0.9.0-RC7",
-    "git_commit" : "cf9797232db1306bfd5696287928cababe317d99",
-    "git_remote" : "https://x-access-token:ghs_KjB7pWu8DQM3iFulLu7RI06qnt5K8S1A0eaE@github.com/viash-hub/htrnaseq"
+    "viash_version" : "0.9.0",
+    "git_commit" : "65dd41d8b1b4a307735c72320c96c0880c75f17f",
+    "git_remote" : "https://x-access-token:ghs_McZDF0yobnnHmOEb2Q4JaaB3pzr9mz1VbIOs@github.com/viash-hub/htrnaseq"
  },
  "package_config" : {
    "name" : "htrnaseq",
    "version" : "main",
    "description" : "High-throughput pipeline [WIP]\n",
-    "viash_version" : "0.9.0-RC7",
+    "info" : {
+      "test_resources" : [
+        {
+          "path" : "gs://viash-hub-test-data/htrnaseq/v1/",
+          "dest" : "resources_test"
+        }
+      ]
+    },
+    "viash_version" : "0.9.0",
    "source" : "src",
    "target" : "target",
    "config_mods" : [
@@ -3072,7 +3086,6 @@ tempscript=".viash_script.sh"
 cat > "$tempscript" << VIASHMAIN
 import logging
 import pandas as pd
-import numpy as np
 from itertools import batched, starmap

 ### VIASH START
@@ -3119,7 +3132,7 @@ logger.setLevel(logging.DEBUG)
 def handle_percentages(column_value):
    # TODO: handle this more gracefully
    if column_value:
-        return np.float64(column_value.strip('%'))
+        return column_value.strip('%')
    return column_value

 def star_log_to_dataframe(barcode: str, log_path) -> pd.DataFrame:
@@ -3137,7 +3150,7 @@ def summary_to_dataframe(barcode: str, summary_path) -> pd.DataFrame:
    logger.info("Reading summary log %s for barcode %s", summary_path, barcode)
    result = pd.read_table(summary_path, sep=",",
                           header=None, names=["Category", "Value"],
-                           index_col=0)
+                           index_col=0, dtype=pd.StringDtype())
    logger.info("Read %d row(s) and %d column(s) from summary file at %s",
                *result.shape, summary_path)
    return result
@@ -3146,9 +3159,14 @@ def summary_to_dataframe(barcode: str, summary_path) -> pd.DataFrame:
 def reads_per_gene_to_dataframe(barcode, read_per_gene_path) -> pd.DataFrame:
    logger.info("Reading reads per gene file %s for barcode %s", read_per_gene_path, barcode)
    result = pd.read_table(read_per_gene_path, skiprows=[0, 1, 2, 3], header=None, sep="\\\\t",
+                           dtype={"geneID": pd.StringDtype(),
+                                  "Unstranded": pd.Int64Dtype(),
+                                  "posStrand": pd.Int64Dtype(),
+                                  "negStrand": pd.Int64Dtype()},
                           index_col=0, names=["geneID", "Unstranded", "posStrand", "negStrand"])
    result = result[["Unstranded"]] # Do not use .loc here because we need a DataFrame, not a Series
    df = pd.DataFrame({"Value": result.sum()})
+    df = df.rename({"Unstranded": "NumberOfCountedReads"}, errors="raise")
    df.index.name = "Category"
    logger.info("Read %d row(s) and %d column(s) from reads per gene file at %s",
                *df.shape, read_per_gene_path)
@@ -3177,12 +3195,16 @@ def star_log_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame
                "\\\\n\\\\t".join(to_keep[~to_keep].index.to_list()))
    result = df.loc[to_keep]

+    # Replace % by pect, remove colons, capitalize the first letter of each
+    # word and remove spaces. You might be tempted to use .title() for the
+    # capitalization, but characters which are already uppercase should stay
+    # that way (example: NumberOfUMIs and not NumberOfUmis).
    result.index = result.index.str.replace("%", "pect")\\\\
                    .str.replace(":", "")\\\\
                    .str.replace(r"(?:^|\\\\s).", lambda m:m.group(0).upper(), regex=True)\\\\
                    .str.replace(" ", "")
    result = result.rename({"UniquelyMappedReadsNumber": "NumberOfMappedReads", 
-                            "UniquelyMappedReadsPect": "pctMappedReads"}, errors="raise")
+                            "UniquelyMappedReadsPect": "PctMappedReads"}, errors="raise")
    logger.info("Done filtering STAR logs for barcode %s. Result has %d row(s) and %d column(s). "
                "Found entries:\\\\n\\\\t%s", 
                barcode, *result.shape, "\\\\n\\\\t".join(result.index.to_list()))
@@ -3198,13 +3220,9 @@ def summary_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame)
        "Reads Mapped to Genome: Unique",
        "Reads Mapped to Transcriptome: Unique Genes",
        "Reads in Cells Mapped to Unique Genes",
-        "Mean Reads per Cell",
        "Median UMI per Cell",
        "Median Genes per Cell",
-        "Q30 Bases in CB+UMI",
        "Reads Mapped to Genome: Unique+Multiple",
-        "Reads Mapped to Transcriptome: Unique+Multipe Genes",
-        "Fraction of Reads in Cells",
        "Median Reads per Cell",
        "Mean UMI per Cell",
        "Mean Genes per Cell",
@@ -3217,8 +3235,15 @@ def summary_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame)
    result = df.loc[to_keep]
    result.index = result.index.str.replace(r"(?:^|\\\\s).", lambda m:m.group(0).upper(),
                                            regex=True).str.replace(" ", "")
-    result = result.rename({"UMIsInCells": "NumberOfUMIs", 
-                            "TotalGenesDetected": "NumberOfGenes"}, errors="raise")
+    to_rename = {"UMIsInCells": "NumberOfUMIs", 
+                 "TotalGenesDetected": "NumberOfGenes"}
+    try:
+        result = result.rename(to_rename, errors="raise")
+    except KeyError as e:
+        raise KeyError(f"Tried to rename log entries ({','.join(to_rename)}) in the summary "
+                       f"log for barcode {barcode}, but an entry was not found in the file. "
+                       "Make sure that you are using the correct version of STAR."
+                       f"Available entries: {", ".join(result.index.to_list())}") from e
    logger.info("Done filtering summary logs for barcode %s. Result has %d row(s) and %d column(s). "
                "Found entries:\\\\n\\\\t%s",
                barcode, *result.shape, "\\\\n\\\\t".join(result.index.to_list()))
@@ -3267,13 +3292,40 @@ def main(par):
    all_stats = pd.concat(all_logs_data, axis=1)
    logger.info("Log statistics were gathered for the following barcodes: %s", 
                ", ".join(all_stats.index.to_list()))
+    dtypes = {
+        'NumberOfInputReads': pd.UInt64Dtype(),
+        'NumberOfMappedReads': pd.UInt64Dtype(),
+        'PctMappedReads': pd.Float64Dtype(),
+        'NumberOfReadsMappedToMultipleLoci': pd.UInt64Dtype(),
+        'PectOfReadsMappedToMultipleLoci':  pd.Float64Dtype(), 
+        'NumberOfReadsMappedToTooManyLoci': pd.UInt64Dtype(),
+        'PectOfReadsMappedToTooManyLoci':  pd.Float64Dtype(),
+        'NumberOfReadsUnmappedTooManyMismatches': pd.UInt64Dtype(),
+        'PectOfReadsUnmappedTooManyMismatches':  pd.Float64Dtype(),
+        'NumberOfReadsUnmappedTooShort': pd.UInt64Dtype(), 
+        'PectOfReadsUnmappedTooShort':  pd.Float64Dtype(),
+        'NumberOfReadsUnmappedOther': pd.UInt64Dtype(),
+        'PectOfReadsUnmappedOther': pd.Float64Dtype(),
+        'ReadsWithValidBarcodes': pd.Float64Dtype(),
+        'SequencingSaturation': pd.Float64Dtype(),
+        'Q30BasesInCB+UMI': pd.Float64Dtype(),
+        'ReadsMappedToTranscriptome:Unique+MultipeGenes': pd.Float64Dtype(),
+        'EstimatedNumberOfCells': pd.UInt64Dtype(),
+        'FractionOfReadsInCells': pd.Float64Dtype(),
+        'MeanReadsPerCell': pd.UInt64Dtype(),
+        'NumberOfUMIs': pd.UInt64Dtype(),
+        'NumberOfGenes': pd.UInt64Dtype(),
+        'NumberOfCountedReads': pd.UInt64Dtype(),
+    }
+    all_stats = all_stats.astype(dtypes) 
    # batched() is used here to print a limited number of columns at a time
    # to make sure that they are all displayed (pandas might limit the view for readability)
    logger.info("Summary of final output:\\\\n%s\\\\n",
                "\\\\n".join(repr(all_stats.loc[:,columns].describe())
                          for columns in batched(all_stats.columns, 3))) 
    logger.info("Writing output to %s", par["output"])
-    all_stats.reset_index("WellBC").to_csv(par["output"], sep="\\\\t", header=True, index=False)
+    all_stats.reset_index("WellBC").to_csv(par["output"], sep="\\\\t", header=True,
+                                           index=False, float_format='%g')
    logger.info("Finished %s.", meta["name"])

 if __name__ == "__main__":
@@ -3358,7 +3410,11 @@ def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) {
              val = val.join(par.multiple_sep)
            }
            if (par.direction == "output" && par.type == "file") {
-              val = val.replaceAll('\\$id', id).replaceAll('\\$key', key)
+              val = val
+                .replaceAll('\\$id', id)
+                .replaceAll('\\$\\{id\\}', id)
+                .replaceAll('\\$key', key)
+                .replaceAll('\\$\\{key\\}', key)
            }
            [parName, val]
          }
@@ -3489,7 +3545,8 @@ def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) {
  def createParentStr = meta.config.allArguments
    .findAll { it.type == "file" && it.direction == "output" && it.create_parent }
    .collect { par -> 
-      "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"] : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }"
+      def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]"
+      "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }"
    }
    .join("\n")

@@ -3497,8 +3554,8 @@ def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) {
  def inputFileExports = meta.config.allArguments
    .findAll { it.type == "file" && it.direction.toLowerCase() == "input" }
    .collect { par ->
-      def viash_par_contents = "(viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName})"
-      "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}=\\\"\" + ${viash_par_contents} + \"\\\"\"}"
+      def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}"
+      "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}"
    }

  // NOTE: if using docker, use /tmp instead of tmpDir!
@@ -3535,6 +3592,7 @@ def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) {
  def procStr = 
  """nextflow.enable.dsl=2
  |
+  |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") }
  |process $procKey {$drctvStrs
  |input:
  |  tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources")
@@ -3546,10 +3604,9 @@ def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) {
  |$stub
  |\"\"\"
  |script:$assertStr
-  |def escapeText = { s -> s.toString().replaceAll('([`"])', '\\\\\\\\\$1') }
  |def parInject = args
  |  .findAll{key, value -> value != null}
-  |  .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}=\\\"\${escapeText(value)}\\\""}
+  |  .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"}
  |  .join("\\n")
  |\"\"\"
  |# meta exports