Build branch main with version main (65dd41d)

Build pipeline: viash-hub.htrnaseq.main-vhms8

Source commit: 65dd41d8b1

Source message: Optimize spawning of processes
This commit is contained in:
CI
2024-11-05 17:26:35 +00:00
parent b8abf8c490
commit f2ff92c6ac
127 changed files with 29560 additions and 1004 deletions

View File

@@ -1,8 +1,8 @@
// combine_star_logs main
//
// This wrapper script is auto-generated by viash 0.9.0-RC7 and is thus a
// derivative work thereof. This software comes with ABSOLUTELY NO WARRANTY from
// Data Intuitive.
// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative
// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
// Intuitive.
//
// The component may contain files which fall under a different license. The
// authors of this component should specify the license in the header of such
@@ -1728,7 +1728,9 @@ def publishStates(Map args) {
def yamlFilename = yamlTemplate_
.replaceAll('\\$id', id_)
.replaceAll('\\$\\{id\\}', id_)
.replaceAll('\\$key', key_)
.replaceAll('\\$\\{key\\}', key_)
// TODO: do the pathnames in state_ match up with the outputFilenames_?
@@ -1799,7 +1801,9 @@ def publishStatesByConfig(Map args) {
def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml'
def yamlFilename = yamlTemplate
.replaceAll('\\$id', id_)
.replaceAll('\\$\\{id\\}', id_)
.replaceAll('\\$key', key_)
.replaceAll('\\$\\{key\\}', key_)
def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent()
// the processed state is a list of [key, value, inputPath, outputFilename] tuples, where
@@ -1841,7 +1845,9 @@ def publishStatesByConfig(Map args) {
// instantiate the template
def filename = filenameTemplate
.replaceAll('\\$id', id_)
.replaceAll('\\$\\{id\\}', id_)
.replaceAll('\\$key', key_)
.replaceAll('\\$\\{key\\}', key_)
if (par.multiple) {
// if the parameter is multiple: true, the filename
// should contain a wildcard '*' that is replaced with
@@ -2809,7 +2815,7 @@ meta = [
"type" : "string",
"name" : "--barcodes",
"description" : "Barcodes responding to the respective log files.\n",
"required" : false,
"required" : true,
"direction" : "input",
"multiple" : true,
"multiple_sep" : ";"
@@ -2823,7 +2829,7 @@ meta = [
],
"must_exist" : true,
"create_parent" : true,
"required" : false,
"required" : true,
"direction" : "input",
"multiple" : true,
"multiple_sep" : ";"
@@ -2837,7 +2843,7 @@ meta = [
],
"must_exist" : true,
"create_parent" : true,
"required" : false,
"required" : true,
"direction" : "input",
"multiple" : true,
"multiple_sep" : ";"
@@ -2848,7 +2854,7 @@ meta = [
"description" : "Paths to the 'ReadsPerGene.out.tab' files as output by STAR.\n",
"must_exist" : true,
"create_parent" : true,
"required" : false,
"required" : true,
"direction" : "input",
"multiple" : true,
"multiple_sep" : ";"
@@ -3026,15 +3032,23 @@ meta = [
"runner" : "nextflow",
"engine" : "docker|native",
"output" : "target/nextflow/stats/combine_star_logs",
"viash_version" : "0.9.0-RC7",
"git_commit" : "cf9797232db1306bfd5696287928cababe317d99",
"git_remote" : "https://x-access-token:ghs_KjB7pWu8DQM3iFulLu7RI06qnt5K8S1A0eaE@github.com/viash-hub/htrnaseq"
"viash_version" : "0.9.0",
"git_commit" : "65dd41d8b1b4a307735c72320c96c0880c75f17f",
"git_remote" : "https://x-access-token:ghs_McZDF0yobnnHmOEb2Q4JaaB3pzr9mz1VbIOs@github.com/viash-hub/htrnaseq"
},
"package_config" : {
"name" : "htrnaseq",
"version" : "main",
"description" : "High-throughput pipeline [WIP]\n",
"viash_version" : "0.9.0-RC7",
"info" : {
"test_resources" : [
{
"path" : "gs://viash-hub-test-data/htrnaseq/v1/",
"dest" : "resources_test"
}
]
},
"viash_version" : "0.9.0",
"source" : "src",
"target" : "target",
"config_mods" : [
@@ -3072,7 +3086,6 @@ tempscript=".viash_script.sh"
cat > "$tempscript" << VIASHMAIN
import logging
import pandas as pd
import numpy as np
from itertools import batched, starmap
### VIASH START
@@ -3119,7 +3132,7 @@ logger.setLevel(logging.DEBUG)
def handle_percentages(column_value):
# TODO: handle this more gracefully
if column_value:
return np.float64(column_value.strip('%'))
return column_value.strip('%')
return column_value
def star_log_to_dataframe(barcode: str, log_path) -> pd.DataFrame:
@@ -3137,7 +3150,7 @@ def summary_to_dataframe(barcode: str, summary_path) -> pd.DataFrame:
logger.info("Reading summary log %s for barcode %s", summary_path, barcode)
result = pd.read_table(summary_path, sep=",",
header=None, names=["Category", "Value"],
index_col=0)
index_col=0, dtype=pd.StringDtype())
logger.info("Read %d row(s) and %d column(s) from summary file at %s",
*result.shape, summary_path)
return result
@@ -3146,9 +3159,14 @@ def summary_to_dataframe(barcode: str, summary_path) -> pd.DataFrame:
def reads_per_gene_to_dataframe(barcode, read_per_gene_path) -> pd.DataFrame:
logger.info("Reading reads per gene file %s for barcode %s", read_per_gene_path, barcode)
result = pd.read_table(read_per_gene_path, skiprows=[0, 1, 2, 3], header=None, sep="\\\\t",
dtype={"geneID": pd.StringDtype(),
"Unstranded": pd.Int64Dtype(),
"posStrand": pd.Int64Dtype(),
"negStrand": pd.Int64Dtype()},
index_col=0, names=["geneID", "Unstranded", "posStrand", "negStrand"])
result = result[["Unstranded"]] # Do not use .loc here because we need a DataFrame, not a Series
df = pd.DataFrame({"Value": result.sum()})
df = df.rename({"Unstranded": "NumberOfCountedReads"}, errors="raise")
df.index.name = "Category"
logger.info("Read %d row(s) and %d column(s) from reads per gene file at %s",
*df.shape, read_per_gene_path)
@@ -3177,12 +3195,16 @@ def star_log_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame
"\\\\n\\\\t".join(to_keep[~to_keep].index.to_list()))
result = df.loc[to_keep]
# Replace % by pect, remove colons, use camel case and remove spaces
# You might be tempted to use .title() to capitalize each word, but .title()
# also lowercases the remaining characters, and characters which are already
# uppercase should stay that way. (example: NumberOfUMIs and not NumberOfUmis)
result.index = result.index.str.replace("%", "pect")\\\\
.str.replace(":", "")\\\\
.str.replace(r"(?:^|\\\\s).", lambda m:m.group(0).upper(), regex=True)\\\\
.str.replace(" ", "")
result = result.rename({"UniquelyMappedReadsNumber": "NumberOfMappedReads",
"UniquelyMappedReadsPect": "pctMappedReads"}, errors="raise")
"UniquelyMappedReadsPect": "PctMappedReads"}, errors="raise")
logger.info("Done filtering STAR logs for barcode %s. Result has %d row(s) and %d column(s). "
"Found entries:\\\\n\\\\t%s",
barcode, *result.shape, "\\\\n\\\\t".join(result.index.to_list()))
@@ -3198,13 +3220,9 @@ def summary_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame)
"Reads Mapped to Genome: Unique",
"Reads Mapped to Transcriptome: Unique Genes",
"Reads in Cells Mapped to Unique Genes",
"Mean Reads per Cell",
"Median UMI per Cell",
"Median Genes per Cell",
"Q30 Bases in CB+UMI",
"Reads Mapped to Genome: Unique+Multiple",
"Reads Mapped to Transcriptome: Unique+Multipe Genes",
"Fraction of Reads in Cells",
"Median Reads per Cell",
"Mean UMI per Cell",
"Mean Genes per Cell",
@@ -3217,8 +3235,15 @@ def summary_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame)
result = df.loc[to_keep]
result.index = result.index.str.replace(r"(?:^|\\\\s).", lambda m:m.group(0).upper(),
regex=True).str.replace(" ", "")
result = result.rename({"UMIsInCells": "NumberOfUMIs",
"TotalGenesDetected": "NumberOfGenes"}, errors="raise")
to_rename = {"UMIsInCells": "NumberOfUMIs",
"TotalGenesDetected": "NumberOfGenes"}
try:
result = result.rename(to_rename, errors="raise")
except KeyError as e:
raise KeyError(f"Tried to rename log entries ({','.join(to_rename)}) in the summary "
f"log for barcode {barcode}, but an entry was not found in the file. "
"Make sure that you are using the correct version of STAR."
f"Available entries: {", ".join(result.index.to_list())}") from e
logger.info("Done filtering summary logs for barcode %s. Result has %d row(s) and %d column(s). "
"Found entries:\\\\n\\\\t%s",
barcode, *result.shape, "\\\\n\\\\t".join(result.index.to_list()))
@@ -3267,13 +3292,40 @@ def main(par):
all_stats = pd.concat(all_logs_data, axis=1)
logger.info("Log statistics were gathered for the following barcodes: %s",
", ".join(all_stats.index.to_list()))
dtypes = {
'NumberOfInputReads': pd.UInt64Dtype(),
'NumberOfMappedReads': pd.UInt64Dtype(),
'PctMappedReads': pd.Float64Dtype(),
'NumberOfReadsMappedToMultipleLoci': pd.UInt64Dtype(),
'PectOfReadsMappedToMultipleLoci': pd.Float64Dtype(),
'NumberOfReadsMappedToTooManyLoci': pd.UInt64Dtype(),
'PectOfReadsMappedToTooManyLoci': pd.Float64Dtype(),
'NumberOfReadsUnmappedTooManyMismatches': pd.UInt64Dtype(),
'PectOfReadsUnmappedTooManyMismatches': pd.Float64Dtype(),
'NumberOfReadsUnmappedTooShort': pd.UInt64Dtype(),
'PectOfReadsUnmappedTooShort': pd.Float64Dtype(),
'NumberOfReadsUnmappedOther': pd.UInt64Dtype(),
'PectOfReadsUnmappedOther': pd.Float64Dtype(),
'ReadsWithValidBarcodes': pd.Float64Dtype(),
'SequencingSaturation': pd.Float64Dtype(),
'Q30BasesInCB+UMI': pd.Float64Dtype(),
'ReadsMappedToTranscriptome:Unique+MultipeGenes': pd.Float64Dtype(),
'EstimatedNumberOfCells': pd.UInt64Dtype(),
'FractionOfReadsInCells': pd.Float64Dtype(),
'MeanReadsPerCell': pd.UInt64Dtype(),
'NumberOfUMIs': pd.UInt64Dtype(),
'NumberOfGenes': pd.UInt64Dtype(),
'NumberOfCountedReads': pd.UInt64Dtype(),
}
all_stats = all_stats.astype(dtypes)
# batched() is used here to print a limited number of columns at a time
# to make sure that they are all displayed (pandas might limit the view for readability)
logger.info("Summary of final output:\\\\n%s\\\\n",
"\\\\n".join(repr(all_stats.loc[:,columns].describe())
for columns in batched(all_stats.columns, 3)))
logger.info("Writing output to %s", par["output"])
all_stats.reset_index("WellBC").to_csv(par["output"], sep="\\\\t", header=True, index=False)
all_stats.reset_index("WellBC").to_csv(par["output"], sep="\\\\t", header=True,
index=False, float_format='%g')
logger.info("Finished %s.", meta["name"])
if __name__ == "__main__":
@@ -3358,7 +3410,11 @@ def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) {
val = val.join(par.multiple_sep)
}
if (par.direction == "output" && par.type == "file") {
val = val.replaceAll('\\$id', id).replaceAll('\\$key', key)
val = val
.replaceAll('\\$id', id)
.replaceAll('\\$\\{id\\}', id)
.replaceAll('\\$key', key)
.replaceAll('\\$\\{key\\}', key)
}
[parName, val]
}
@@ -3489,7 +3545,8 @@ def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) {
def createParentStr = meta.config.allArguments
.findAll { it.type == "file" && it.direction == "output" && it.create_parent }
.collect { par ->
"\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"] : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }"
def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]"
"\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }"
}
.join("\n")
@@ -3497,8 +3554,8 @@ def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) {
def inputFileExports = meta.config.allArguments
.findAll { it.type == "file" && it.direction.toLowerCase() == "input" }
.collect { par ->
def viash_par_contents = "(viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName})"
"\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}=\\\"\" + ${viash_par_contents} + \"\\\"\"}"
def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}"
"\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}"
}
// NOTE: if using docker, use /tmp instead of tmpDir!
@@ -3535,6 +3592,7 @@ def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) {
def procStr =
"""nextflow.enable.dsl=2
|
|def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") }
|process $procKey {$drctvStrs
|input:
| tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources")
@@ -3546,10 +3604,9 @@ def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) {
|$stub
|\"\"\"
|script:$assertStr
|def escapeText = { s -> s.toString().replaceAll('([`"])', '\\\\\\\\\$1') }
|def parInject = args
| .findAll{key, value -> value != null}
| .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}=\\\"\${escapeText(value)}\\\""}
| .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"}
| .join("\\n")
|\"\"\"
|# meta exports