Build branch v0.3 with version v0.3.0 (e21130f)

Build pipeline: viash-hub.rnaseq.v0.3-6gfl7 Source commit: e21130ff7a Source message: Bump version to v0.3.0
2025-05-07 13:04:57 +00:00
commit 637ea108cd
573 changed files with 424840 additions and 0 deletions
--- a/src/cat_additional_fasta/script.py
+++ b/src/cat_additional_fasta/script.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Read a custom fasta file and create a custom GTF containing each entry
+"""
+from itertools import groupby
+import logging
+import os
+import sys
+
+## VIASH START
+par = {
+    "fasta": "testData/minimal_test/reference/genome.fasta",
+    "gtf": "testData/minimal_test/reference/genes.gtf",
+    "additional_fasta": "testData/minimal_test/reference/gfp.fa.gz",
+    "biotype": "gene_biotype", 
+    "fasta_output": "genome_gfp.fasta",
+    "gtf_output": "genome_gfp.gtf",
+}
+meta = {
+    "functionality_name": "cat_additonal_fasta"
+}
+## VIASH END
+
+def fasta_iter(fasta_name):
+    """
+    modified from Brent Pedersen
+    Correct Way To Parse A Fasta File In Python
+    given a fasta file. yield tuples of header, sequence
+
+    Fasta iterator from https://www.biostars.org/p/710/#120760
+    """
+    with open(fasta_name) as fh:
+        # ditch the boolean (x[0]) and just keep the header or sequence since
+        # we know they alternate.
+        faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
+        for header in faiter:
+            # drop the ">"
+            headerStr = header.__next__()[1:].strip()
+            # join all sequence lines to one.
+            seq = "".join(s.strip() for s in faiter.__next__())
+            yield (headerStr, seq)
+
+def fasta2gtf(fasta, output, biotype):
+    fiter = fasta_iter(fasta)
+    # GTF output lines
+    lines = []
+    attributes = 'exon_id "{name}.1"; exon_number "1";{biotype} gene_id "{name}_gene"; gene_name "{name}_gene"; gene_source "custom"; transcript_id "{name}_gene"; transcript_name "{name}_gene";\n'
+    line_template = "{name}\ttransgene\texon\t1\t{length}\t.\t+\t.\t" + attributes
+    for ff in fiter:
+        name, seq = ff
+        # Use first ID as separated by spaces as the "sequence name"
+        # (equivalent to "chromosome" in other cases)
+        seqname = name.split()[0]
+        # Remove all spaces
+        name = seqname.replace(" ", "_")
+        length = len(seq)
+        biotype_attr = ""
+        if biotype:
+            biotype_attr = f' {biotype} "transgene";'
+        line = line_template.format(name=name, length=length, biotype=biotype_attr)
+        lines.append(line)
+    with open(output, "w") as f:
+        f.write("".join(lines))
+
+add_name = os.path.basename(par['additional_fasta'])
+output = os.path.splitext(add_name)[0] + ".gtf"
+fasta2gtf(par['additional_fasta'], output, par['biotype'])
+
+with open(par['fasta'], 'r') as f1:
+    content1 = f1.read()
+with open(par['additional_fasta'], 'r') as f2:
+    content2 = f2.read()
+with open(par['fasta_output'], 'w') as f_out:
+    f_out.write(content1 + content2)
+with open(par['gtf'], 'r') as g1:
+    g_content1 = g1.read()
+with open(output, 'r') as g2:
+    g_content2 = g2.read()
+with open(par['gtf_output'], 'w') as g_out:
+    g_out.write(g_content1 + g_content2)