Files
htrnaseq/src/eset/create_fdata/test.py
CI 17f8fccf2a Build branch fix_create_report_build with version fix_create_report_build (af02c4b)
Build pipeline: viash-hub.htrnaseq.fix-create-report-build-hgl2d

Source commit: af02c4b8d7

Source message: Update CHANGELOG
2025-04-29 10:17:56 +00:00

102 lines
4.0 KiB
Python

import pytest
import sys
import pandas as pd
from pathlib import Path
from uuid import uuid4
from shutil import copyfile
### VIASH START
# NOTE(review): the START/END markers look like the viash placeholder
# convention, where the region is presumably overwritten when the component
# is built or tested -- confirm against the viash documentation.
# "resources_dir" is the only key read by the visible tests (to locate
# test_annotation.gtf).
meta = {
    "resources_dir": "./src/eset/create_fdata/",
    "executable": "target/executable/eset/create_fdata/create_fdata",
    "config": "src/eset/create_fdata/config.vsh.yaml",
}
### VIASH END
@pytest.fixture
def test_annotation_path():
    """Return the path to ``test_annotation.gtf`` in the resources directory.

    The directory is taken from ``meta["resources_dir"]``.
    """
    resources_dir = Path(meta["resources_dir"])
    return resources_dir / "test_annotation.gtf"
@pytest.fixture
def random_path(tmp_path):
    """Provide a factory for unique paths located under ``tmp_path``.

    The returned callable accepts an optional ``extension`` (given without
    the leading dot) and returns ``tmp_path / "<uuid4>"`` with
    ``".<extension>"`` appended when an extension is supplied. Only the
    path is built; nothing is written to disk.
    """

    def wrapper(extension=None):
        # A falsy extension (None or "") yields a name without any suffix.
        suffix = f".{extension}" if extension else ""
        return tmp_path / f"{uuid4()}{suffix}"

    return wrapper
def test_create_fdata(run_component, test_annotation_path, random_path):
    """Run the component on the test annotation and check the TSV it writes.

    The component is invoked with ``--gtf`` set to the test annotation and
    ``--output`` set to a fresh temporary ``.tsv`` path. The output must
    exist and, read back with every column as a string, must equal the
    expected four-gene table below.
    """
    string_dtype = pd.StringDtype()
    output_path = random_path("tsv")

    component_args = [
        "--gtf", test_annotation_path,
        "--output", output_path,
    ]
    run_component(component_args)
    assert output_path.is_file()

    # Everything is read as strings so numeric-looking columns (start, end,
    # gene_version) compare exactly against the string literals below.
    result = pd.read_csv(output_path, sep="\t", dtype=string_dtype)

    expected_dict = {
        "seqname": ["20", "20", "20", "21"],
        "start": ["87250", "142590", "157454", "297570"],
        "end": ["97094", "145751", "159163", "300321"],
        "strand": ["+", "+", "+", "+"],
        "gene_id": [
            "ENSG00000178591",
            "ENSG00000125788",
            "ENSG00000088782",
            "ENSG00000247315",
        ],
        "gene_version": ["7", "6", "5", "4"],
        "gene_name": ["DEFB125", "DEFB126", "DEFB127", pd.NA],
        "gene_source": [
            "ensembl_havana",
            "ensembl_havana",
            "ensembl_havana",
            "havana",
        ],
        "gene_biotype": [
            "protein_coding",
            "protein_coding",
            "protein_coding",
            "protein_coding",
        ],
        "ENSEMBL_with_version": [
            "ENSG00000178591.7",
            "ENSG00000125788",
            "ENSG00000088782",
            "ENSG00000247315",
        ],
        "ENSEMBL": [
            "ENSG00000178591",
            "ENSG00000125788",
            "ENSG00000088782",
            "ENSG00000247315",
        ],
        "SYMBOL": ["DEFB125", "DEFB126", "DEFB127", pd.NA],
    }
    expected = pd.DataFrame.from_dict(expected_dict, dtype=string_dtype)

    # check_like=True: the order of index and columns is ignored.
    pd.testing.assert_frame_equal(expected, result, check_like=True)
def test_make_unique(run_component, test_annotation_path, random_path):
    """Check that a duplicated gene record does not add a row to the output.

    The test annotation is copied to a temporary GTF and one extra ``gene``
    record is appended for ``ENSG00000178591.7`` (seqname 20, 87250-97094),
    the same gene and coordinates as the first expected row. The component
    is then run on that file, and its output must equal the same four-gene
    table that is expected for the unmodified annotation.
    """
    gtf_with_duplicate_entry_path = random_path("gtf")
    output_path = random_path("tsv")

    # GTF columns are tab-delimited; only the attribute column (the last
    # one) uses spaces internally. Joining with an explicit "\t" keeps the
    # separator from being lost to whitespace normalisation of the source.
    duplicate_record_fields = [
        "20",
        "ensembl_havana",
        "gene",
        "87250",
        "97094",
        ".",
        "+",
        ".",
        "gene_id \"ENSG00000178591.7\"; gene_version \"7\"; "
        "gene_name \"DEFB125\"; gene_source \"ensembl_havana\"; "
        "gene_biotype \"protein_coding\";",
    ]
    # The leading newline guards against a source file that does not end
    # with a line terminator.
    entry_to_add = "\n" + "\t".join(duplicate_record_fields) + "\n"

    copyfile(test_annotation_path, gtf_with_duplicate_entry_path)
    with gtf_with_duplicate_entry_path.open("a") as open_gtf:
        open_gtf.write(entry_to_add)

    run_component([
        "--gtf", gtf_with_duplicate_entry_path,
        "--output", output_path,
    ])
    assert output_path.is_file()

    # Read every column as a string so values compare exactly against the
    # string literals in the expected table.
    result = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype())
    expected_dict = {
        "seqname": ["20", "20", "20", "21"],
        "start": ["87250", "142590", "157454", "297570"],
        "end": ["97094", "145751", "159163", "300321"],
        "strand": ["+", "+", "+", "+"],
        "gene_id": ["ENSG00000178591", "ENSG00000125788",
                    "ENSG00000088782", "ENSG00000247315"],
        "gene_version": ["7", "6", "5", "4"],
        "gene_name": ["DEFB125", "DEFB126", "DEFB127", pd.NA],
        "gene_source": ["ensembl_havana", "ensembl_havana",
                        "ensembl_havana", "havana"],
        "gene_biotype": ["protein_coding", "protein_coding",
                         "protein_coding", "protein_coding"],
        "ENSEMBL_with_version": ["ENSG00000178591.7", "ENSG00000125788",
                                 "ENSG00000088782", "ENSG00000247315"],
        "ENSEMBL": ["ENSG00000178591", "ENSG00000125788",
                    "ENSG00000088782", "ENSG00000247315"],
        "SYMBOL": ["DEFB125", "DEFB126", "DEFB127", pd.NA],
    }
    expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype())

    # check_like=True: the order of index and columns is ignored.
    pd.testing.assert_frame_equal(expected, result, check_like=True)
if __name__ == "__main__":
    # When executed as a script, run pytest on this file and propagate its
    # exit status to the calling process.
    exit_status = pytest.main([__file__])
    sys.exit(exit_status)