Build branch main with version main (3c05b79)

Build pipeline: viash-hub.htrnaseq.main-bc45k Source commit: 3c05b7950b Source message: Remove duplicate entries from feature data (#41)
2025-02-17 15:45:09 +00:00
parent 1f84d46798
commit 6468e4a5de
61 changed files with 144 additions and 76 deletions
--- a/target/executable/eset/create_fdata/.config.vsh.yaml
+++ b/target/executable/eset/create_fdata/.config.vsh.yaml
@@ -177,7 +177,7 @@ build_info:
  output: "target/executable/eset/create_fdata"
  executable: "target/executable/eset/create_fdata/create_fdata"
  viash_version: "0.9.0"
-  git_commit: "a0a780a9c1554e0e02de9e05bc88313594a08c6c"
+  git_commit: "3c05b7950b9627acdaa70687e0915d5ee69b6d1b"
  git_remote: "https://github.com/viash-hub/htrnaseq"
 package_config:
  name: "htrnaseq"
--- a/target/executable/eset/create_fdata/create_fdata
+++ b/target/executable/eset/create_fdata/create_fdata
@@ -478,9 +478,9 @@ RUN pip install --upgrade pip && \

 LABEL org.opencontainers.image.authors="Dries Schaumont, Marijke Van Moerbeke"
 LABEL org.opencontainers.image.description="Companion container for running component eset create_fdata"
-LABEL org.opencontainers.image.created="2025-02-17T07:49:05Z"
+LABEL org.opencontainers.image.created="2025-02-17T14:58:42Z"
 LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq"
-LABEL org.opencontainers.image.revision="a0a780a9c1554e0e02de9e05bc88313594a08c6c"
+LABEL org.opencontainers.image.revision="3c05b7950b9627acdaa70687e0915d5ee69b6d1b"
 LABEL org.opencontainers.image.version="main"

 VIASHDOCKER
@@ -1162,10 +1162,17 @@ def main(par):
        idx, cols = pd.factorize(column_to_get)
        symbol_values = annotation.reindex(cols, axis=1).to_numpy()[np.arange(len(annotation)), idx]
        annotation["SYMBOL"] = symbol_values
-
-    logger.info("Writing to %s", par["output"])
+    logger.info("Dropping unused columns")
    annotation = annotation.drop(["score", "source", "frame", "feature"], axis=1)
+    logger.info("Looking for duplicate rows and removing them. Starting with %i entries", annotation.shape[0])
+    annotation = annotation.drop_duplicates(keep="first", ignore_index=True)
+    logger.info("After removing duplicates: %i entries", annotation.shape[0])
+    logger.info("Writing to %s", par["output"])
    annotation.to_csv(par["output"], sep="\\t", header=True, index=False, na_rep="NA")
+    # Run this check *after* writing the csv so the written file can still be inspected if the check fails
+    logger.info("Checking for unique gene IDs")
+    if not annotation["gene_id"].is_unique:
+        raise ValueError("Values from the 'gene_id' column are not unique after processing!") 
    logger.info("%s finished", meta['name'])