import pytest
import sys
import pandas as pd
from pathlib import Path
from uuid import uuid4

### VIASH START
meta = {
    "resources_dir": "./src/eset/create_pdata/",
    "executable": "target/executable/eset/create_pdata/create_pdata",
    "config": "src/eset/create_pdata/config.vsh.yaml"
}
### VIASH END

# Expected output table shared by the tests below. Every value is kept as a
# string because the component output is read back with a string dtype.
_EXPECTED_PDATA = {
    'WellBC': ['AACAAGGTAC', 'ACGCCTTCGT', 'CCATACTGAC', 'GCAAGCGAAT', 'GTCTCGAGTG', 'TGCGCTCATT', 'TTGTGTTCGA'],
    'WellID': ['A1', 'A2', 'A3', 'B1', 'C5', 'D6', 'E19'],
    'NumberOfMTReads': ['0', '0', '0', '0', '0', '0', '0'],
    'pctMT': ['0', '0', '0', '0', '0', '0', '0'],
    'NumberOfERCCReads': ['0', '0', '0', '0', '0', '0', '0'],
    'pctERCC': ['0', '0', '0', '0', '0', '0', '0'],
    'NumberOfChromReads': ['8542', '5863', '7396', '10092', '470', '7650', '9422'],
    'pctChrom': ['100', '100', '100', '100', '100', '100', '100'],
    'NumberOfInputReads': ['141303', '96430', '113577', '156134', '10158', '126989', '142560'],
    'NumberOfMappedReads': ['23749', '16869', '17319', '24005', '1902', '19272', '22129'],
    'PctMappedReads': ['16.81', '17.49', '15.25', '15.37', '18.72', '15.18', '15.52'],
    'NumberOfReadsMappedToMultipleLoci': ['0', '0', '0', '0', '0', '0', '0'],
    'PectOfReadsMappedToMultipleLoci': ['0', '0', '0', '0', '0', '0', '0'],
    'NumberOfReadsMappedToTooManyLoci': ['8458', '6124', '5905', '7961', '967', '7141', '7045'],
    'PectOfReadsMappedToTooManyLoci': ['5.99', '6.35', '5.2', '5.1', '9.52', '5.62', '4.94'],
    'NumberOfReadsUnmappedTooManyMismatches': ['0', '0', '0', '0', '0', '0', '0'],
    'PectOfReadsUnmappedTooManyMismatches': ['0', '0', '0', '0', '0', '0', '0'],
    'NumberOfReadsUnmappedTooShort': ['109035', '73375', '90292', '124096', '7280', '100515', '113324'],
    'PectOfReadsUnmappedTooShort': ['77.16', '76.09', '79.5', '79.48', '71.67', '79.15', '79.49'],
    'NumberOfReadsUnmappedOther': ['61', '62', '61', '72', '9', '61', '62'],
    'PectOfReadsUnmappedOther': ['0.04', '0.06', '0.05', '0.05', '0.09', '0.05', '0.04'],
    'ReadsWithValidBarcodes': ['0.999816', '0.999782', '0.999859', '0.999744', '0.999803', '0.999843', '0.999783'],
    'SequencingSaturation': ['0.0698056', '0.0665302', '0.0717282', '0.0680872', '0.0553191', '0.0667974', '0.060828'],
    'Q30BasesInCB+UMI': ['0.979965', '0.980077', '0.982313', '0.982779', '0.984451', '0.986581', '0.986622'],
    'ReadsMappedToTranscriptome:Unique+MultipeGenes': ['0.0618175', '0.0620969', '0.066554', '0.0658665', '0.0476472', '0.0616668', '0.0676838'],
    'EstimatedNumberOfCells': ['1', '1', '1', '1', '1', '1', '1'],
    'FractionOfReadsInCells': ['1', '1', '1', '1', '1', '1', '1'],
    'MeanReadsPerCell': ['8538', '5862', '7389', '10090', '470', '7650', '9420'],
    'NumberOfUMIs': ['7942', '5472', '6859', '9403', '444', '7139', '8847'],
    'NumberOfGenes': ['408', '377', '391', '420', '150', '407', '420'],
    'NumberOfCountedReads': ['9535', '6463', '8299', '11273', '533', '8444', '10383'],
}


def _expected_pdata(**column_overrides):
    """Build the expected output frame, optionally replacing whole columns.

    Args:
        **column_overrides: Column name -> list of values that replaces the
            corresponding column of the shared expected table.

    Returns:
        A string-dtype ``pandas.DataFrame`` holding the expected table.
    """
    # Merging into a new dict leaves the shared module-level table untouched.
    columns = {**_EXPECTED_PDATA, **column_overrides}
    return pd.DataFrame.from_dict(columns, dtype=pd.StringDtype())


@pytest.fixture
def test_reads_and_genes_per_chr_path():
    """Path to the ``nrReadsNrGenesPerChromPool.txt`` test resource."""
    return Path(meta["resources_dir"]) / "nrReadsNrGenesPerChromPool.txt"


@pytest.fixture
def test_star_logs_summary_path():
    """Path to the ``starLogs.txt`` test resource."""
    return Path(meta["resources_dir"]) / "starLogs.txt"


@pytest.fixture
def random_path(tmp_path):
    """Return a factory producing unique, not-yet-existing paths in ``tmp_path``.

    The factory accepts an optional ``extension`` (without the leading dot);
    when it is empty or ``None`` the generated name has no suffix.
    """
    def wrapper(extension=None):
        extension = "" if not extension else f".{extension}"
        return tmp_path / f"{uuid4()}{extension}"
    return wrapper


def test_create_pdata(run_component, test_reads_and_genes_per_chr_path,
                      test_star_logs_summary_path, random_path):
    """Run the component on the unmodified resources and compare the output table."""
    output_path = random_path("tsv")
    run_component([
        "--star_stats_file", test_star_logs_summary_path,
        "--nrReadsNrGenesPerChromPool", test_reads_and_genes_per_chr_path,
        "--output", output_path
    ])
    assert output_path.is_file()

    result = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype())
    expected = _expected_pdata()
    # check_like=True ignores the order of rows and columns.
    pd.testing.assert_frame_equal(result, expected, check_like=True)


def test_na(run_component, test_reads_and_genes_per_chr_path,
            test_star_logs_summary_path, random_path):
    """
    The star log summary can contain NA values.
    """
    output_path = random_path("tsv")
    summary_with_na_path = random_path("txt")

    # Write a copy of the STAR summary in which one well has a missing value.
    original_summary = pd.read_csv(test_star_logs_summary_path, sep="\t", index_col=0)
    original_summary.loc["GTCTCGAGTG", "FractionOfReadsInCells"] = pd.NA
    original_summary.reset_index("WellBC").to_csv(summary_with_na_path, sep="\t",
                                                  header=True, index=False)

    run_component([
        "--star_stats_file", summary_with_na_path,
        "--nrReadsNrGenesPerChromPool", test_reads_and_genes_per_chr_path,
        "--output", output_path
    ])
    # Fail with a clear message if the component produced no output at all.
    assert output_path.is_file()

    # In this scenario the non-missing values are expected as '1.0' instead of
    # '1' (presumably because the column is handled as float once it contains
    # a missing value -- confirm against the component).
    expected = _expected_pdata(
        FractionOfReadsInCells=['1.0', '1.0', '1.0', '1.0', pd.NA, '1.0', '1.0'],
    )
    result = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype())
    pd.testing.assert_frame_equal(result, expected, check_like=True)


if __name__ == '__main__':
    sys.exit(pytest.main([__file__]))