Files
htrnaseq/src/eset/create_pdata/test.py
CI 8d38735e19 Build branch htrnaseq/project_experimentes_uniqueness with version project_experimentes_uniqueness to htrnaseq on branch project_experimentes_uniqueness (93c3200)
Build pipeline: viash-hub.htrnaseq.project-experimentes-uniqueness-lcdnf

Source commit: 93c3200f38

Source message: Undo some changes
2025-11-19 15:13:51 +00:00

173 lines
9.3 KiB
Python

import pytest
import sys
import pandas as pd
from pathlib import Path
from uuid import uuid4
### VIASH START
# Placeholder values so this test file can be run directly from the repository.
# NOTE(review): Viash is expected to replace everything between the
# VIASH START / VIASH END markers when the component is built — confirm
# against the Viash documentation.
meta = {
# Directory holding the test resource files (read by the path fixtures).
"resources_dir": "./src/eset/create_pdata/",
# Not referenced directly in this file; presumably consumed by the
# `run_component` fixture — verify against the Viash test helpers.
"executable": "target/executable/eset/create_pdata/create_pdata",
"config": "src/eset/create_pdata/config.vsh.yaml"
}
### VIASH END
@pytest.fixture
def test_reads_and_genes_per_chr_path():
    """Return the path of ``nrReadsNrGenesPerChromPool.txt`` in the resources dir."""
    resources_dir = Path(meta["resources_dir"])
    return resources_dir.joinpath("nrReadsNrGenesPerChromPool.txt")
@pytest.fixture
def test_star_logs_summary_path():
    """Return the path of ``starLogs.txt`` in the resources dir."""
    resources_dir = Path(meta["resources_dir"])
    return resources_dir.joinpath("starLogs.txt")
@pytest.fixture
def random_path(tmp_path):
    """Provide a factory that builds unique paths below ``tmp_path``.

    The returned callable accepts an optional ``extension`` (given without
    the leading dot) and returns ``tmp_path / "<uuid4>"`` with
    ``".<extension>"`` appended when an extension is supplied. The path is
    only computed; nothing is created on disk.
    """
    def make_path(extension=None):
        # A falsy extension (None or "") yields a name without any suffix.
        suffix = f".{extension}" if extension else ""
        return tmp_path.joinpath(f"{uuid4()}{suffix}")

    return make_path
def test_create_pdata(run_component, test_reads_and_genes_per_chr_path,
                      test_star_logs_summary_path, random_path):
    """Run the component on the test resources and check the written table.

    The output is read back with every column as a string and compared to
    the expected table; ``check_like=True`` makes the comparison ignore the
    order of rows and columns.
    """
    pdata_tsv = random_path("tsv")
    component_args = [
        "--star_stats_file", test_star_logs_summary_path,
        "--nrReadsNrGenesPerChromPool", test_reads_and_genes_per_chr_path,
        "--output", pdata_tsv,
    ]
    run_component(component_args)
    assert pdata_tsv.is_file()

    string_dtype = pd.StringDtype()
    observed = pd.read_csv(pdata_tsv, sep="\t", dtype=string_dtype)

    # One entry per well; constant columns are spelled as repeated values.
    expected_columns = {
        "WellBC": [
            "AACAAGGTAC", "ACGCCTTCGT", "CCATACTGAC", "GCAAGCGAAT",
            "GTCTCGAGTG", "TGCGCTCATT", "TTGTGTTCGA",
        ],
        "WellID": ["A1", "A2", "A3", "B1", "C5", "D6", "E19"],
        "NumberOfMTReads": ["0"] * 7,
        "pctMT": ["0"] * 7,
        "NumberOfERCCReads": ["0"] * 7,
        "pctERCC": ["0"] * 7,
        "NumberOfChromReads": [
            "8542", "5863", "7396", "10092", "470", "7650", "9422",
        ],
        "pctChrom": ["100"] * 7,
        "NumberOfInputReads": [
            "141303", "96430", "113577", "156134", "10158", "126989",
            "142560",
        ],
        "NumberOfMappedReads": [
            "23749", "16869", "17319", "24005", "1902", "19272", "22129",
        ],
        "PctMappedReads": [
            "16.81", "17.49", "15.25", "15.37", "18.72", "15.18", "15.52",
        ],
        "NumberOfReadsMappedToMultipleLoci": ["0"] * 7,
        "PectOfReadsMappedToMultipleLoci": ["0"] * 7,
        "NumberOfReadsMappedToTooManyLoci": [
            "8458", "6124", "5905", "7961", "967", "7141", "7045",
        ],
        "PectOfReadsMappedToTooManyLoci": [
            "5.99", "6.35", "5.2", "5.1", "9.52", "5.62", "4.94",
        ],
        "NumberOfReadsUnmappedTooManyMismatches": ["0"] * 7,
        "PectOfReadsUnmappedTooManyMismatches": ["0"] * 7,
        "NumberOfReadsUnmappedTooShort": [
            "109035", "73375", "90292", "124096", "7280", "100515",
            "113324",
        ],
        "PectOfReadsUnmappedTooShort": [
            "77.16", "76.09", "79.5", "79.48", "71.67", "79.15", "79.49",
        ],
        "NumberOfReadsUnmappedOther": [
            "61", "62", "61", "72", "9", "61", "62",
        ],
        "PectOfReadsUnmappedOther": [
            "0.04", "0.06", "0.05", "0.05", "0.09", "0.05", "0.04",
        ],
        "ReadsWithValidBarcodes": [
            "0.999816", "0.999782", "0.999859", "0.999744", "0.999803",
            "0.999843", "0.999783",
        ],
        "SequencingSaturation": [
            "0.0698056", "0.0665302", "0.0717282", "0.0680872", "0.0553191",
            "0.0667974", "0.060828",
        ],
        "Q30BasesInCB+UMI": [
            "0.979965", "0.980077", "0.982313", "0.982779", "0.984451",
            "0.986581", "0.986622",
        ],
        "ReadsMappedToTranscriptome:Unique+MultipeGenes": [
            "0.0618175", "0.0620969", "0.066554", "0.0658665", "0.0476472",
            "0.0616668", "0.0676838",
        ],
        "EstimatedNumberOfCells": ["1"] * 7,
        "FractionOfReadsInCells": ["1"] * 7,
        "MeanReadsPerCell": [
            "8538", "5862", "7389", "10090", "470", "7650", "9420",
        ],
        "NumberOfUMIs": [
            "7942", "5472", "6859", "9403", "444", "7139", "8847",
        ],
        "NumberOfGenes": ["408", "377", "391", "420", "150", "407", "420"],
        "NumberOfCountedReads": [
            "9535", "6463", "8299", "11273", "533", "8444", "10383",
        ],
    }
    expected = pd.DataFrame.from_dict(expected_columns, dtype=string_dtype)
    pd.testing.assert_frame_equal(observed, expected, check_like=True)
def test_na(run_component, test_reads_and_genes_per_chr_path,
            test_star_logs_summary_path, random_path):
    """
    The star log summary can contain NA values.

    A copy of the STAR summary is written in which ``FractionOfReadsInCells``
    is missing for well barcode ``GTCTCGAGTG``; the component is run on that
    copy and the missing value must come back as NA in the output table.
    The remaining values of that column are expected as ``'1.0'`` rather than
    ``'1'`` (presumably because the column is stored as floats once it holds
    a missing value).
    """
    output_path = random_path("tsv")
    summary_with_na_path = random_path("txt")

    # Load the summary indexed by well barcode so one cell can be blanked.
    original_summary = pd.read_csv(test_star_logs_summary_path,
                                   sep="\t", index_col=0)
    original_summary.loc["GTCTCGAGTG", "FractionOfReadsInCells"] = pd.NA
    # Write the barcode back as a regular column, as in the input file.
    original_summary.reset_index("WellBC").to_csv(summary_with_na_path, sep="\t",
                                                  header=True, index=False)
    run_component([
        "--star_stats_file", summary_with_na_path,
        "--nrReadsNrGenesPerChromPool", test_reads_and_genes_per_chr_path,
        "--output", output_path
    ])
    # Fail with a clear assertion (as test_create_pdata does) when the
    # component produced no output, instead of an error from read_csv.
    assert output_path.is_file()
    expected_dict = {
        'WellBC': ['AACAAGGTAC', 'ACGCCTTCGT', 'CCATACTGAC', 'GCAAGCGAAT',
                   'GTCTCGAGTG', 'TGCGCTCATT', 'TTGTGTTCGA'],
        'WellID': ['A1', 'A2', 'A3', 'B1', 'C5', 'D6', 'E19'],
        'NumberOfMTReads': ['0', '0', '0', '0', '0', '0', '0'],
        'pctMT': ['0', '0', '0', '0', '0', '0', '0'],
        'NumberOfERCCReads': ['0', '0', '0', '0', '0', '0', '0'],
        'pctERCC': ['0', '0', '0', '0', '0', '0', '0'],
        'NumberOfChromReads': ['8542', '5863', '7396', '10092', '470',
                               '7650', '9422'],
        'pctChrom': ['100', '100', '100', '100', '100', '100', '100'],
        'NumberOfInputReads': ['141303', '96430', '113577', '156134', '10158',
                               '126989', '142560'],
        'NumberOfMappedReads': ['23749', '16869', '17319', '24005', '1902',
                                '19272', '22129'],
        'PctMappedReads': ['16.81', '17.49', '15.25', '15.37', '18.72',
                           '15.18', '15.52'],
        'NumberOfReadsMappedToMultipleLoci': ['0', '0', '0', '0', '0', '0', '0'],
        'PectOfReadsMappedToMultipleLoci': ['0', '0', '0', '0', '0', '0', '0'],
        'NumberOfReadsMappedToTooManyLoci': ['8458', '6124', '5905', '7961', '967',
                                             '7141', '7045'],
        'PectOfReadsMappedToTooManyLoci': ['5.99', '6.35', '5.2', '5.1', '9.52',
                                           '5.62', '4.94'],
        'NumberOfReadsUnmappedTooManyMismatches': ['0', '0', '0', '0', '0', '0', '0'],
        'PectOfReadsUnmappedTooManyMismatches': ['0', '0', '0', '0', '0', '0', '0'],
        'NumberOfReadsUnmappedTooShort': ['109035', '73375', '90292', '124096',
                                          '7280', '100515', '113324'],
        'PectOfReadsUnmappedTooShort': ['77.16', '76.09', '79.5', '79.48',
                                        '71.67', '79.15', '79.49'],
        'NumberOfReadsUnmappedOther': ['61', '62', '61', '72', '9', '61', '62'],
        'PectOfReadsUnmappedOther': ['0.04', '0.06', '0.05', '0.05',
                                     '0.09', '0.05', '0.04'],
        'ReadsWithValidBarcodes': ['0.999816', '0.999782', '0.999859', '0.999744',
                                   '0.999803', '0.999843', '0.999783'],
        'SequencingSaturation': ['0.0698056', '0.0665302', '0.0717282', '0.0680872',
                                 '0.0553191', '0.0667974', '0.060828'],
        'Q30BasesInCB+UMI': ['0.979965', '0.980077', '0.982313', '0.982779',
                             '0.984451', '0.986581', '0.986622'],
        'ReadsMappedToTranscriptome:Unique+MultipeGenes': ['0.0618175', '0.0620969',
                                                           '0.066554', '0.0658665',
                                                           '0.0476472', '0.0616668',
                                                           '0.0676838'],
        'EstimatedNumberOfCells': ['1', '1', '1', '1', '1', '1', '1'],
        # The blanked cell (fifth well, GTCTCGAGTG) must be read back as NA.
        'FractionOfReadsInCells': ['1.0', '1.0', '1.0', '1.0', pd.NA, '1.0', '1.0'],
        'MeanReadsPerCell': ['8538', '5862', '7389',
                             '10090', '470', '7650', '9420'],
        'NumberOfUMIs': ['7942', '5472', '6859', '9403',
                         '444', '7139', '8847'],
        'NumberOfGenes': ['408', '377', '391', '420', '150', '407', '420'],
        'NumberOfCountedReads': ['9535', '6463', '8299', '11273',
                                 '533', '8444', '10383']
    }
    # Read every column as a string so values are compared exactly as written.
    result = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype())
    expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype())
    # check_like=True: row and column order are not part of the contract.
    pd.testing.assert_frame_equal(result, expected, check_like=True)
if __name__ == '__main__':
    # Run the tests in this file with pytest and exit with its return value.
    exit_status = pytest.main([__file__])
    sys.exit(exit_status)