diff --git a/bio2zarr/__main__.py b/bio2zarr/__main__.py index cab080b6..024b9ab2 100644 --- a/bio2zarr/__main__.py +++ b/bio2zarr/__main__.py @@ -15,6 +15,7 @@ def bio2zarr(): # is handy for development and for those whose PATHs aren't set # up in the right way. bio2zarr.add_command(cli.vcf2zarr_main) +bio2zarr.add_command(cli.bed2zarr_main) bio2zarr.add_command(cli.plink2zarr) bio2zarr.add_command(cli.vcfpartition) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py new file mode 100644 index 00000000..de36a3c0 --- /dev/null +++ b/bio2zarr/bed2zarr.py @@ -0,0 +1,229 @@ +import dataclasses +import logging +import math +from enum import Enum +from typing import Any + +import numcodecs +import numpy as np +import pandas as pd +import zarr + +from . import core, provenance + +logger = logging.getLogger(__name__) + +DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7) +BED_ZARR_VERSION = 0.1 + + +class BedType(Enum): + BED3 = 3 + BED4 = 4 + BED5 = 5 + BED6 = 6 + BED7 = 7 + BED8 = 8 + BED9 = 9 + BED12 = 12 + + +@dataclasses.dataclass +class BedFieldSummary(core.JsonDataclass): + num_chunks: int = 0 + compressed_size: int = 0 + uncompressed_size: int = 0 + max_value: Any = -math.inf + min_value: Any = math.inf + + def update(self, other): + self.num_chunks = other.num_chunks + self.compressed_size = other.compressed_size + self.uncompressed_size = other.uncompressed_size + self.min_value = min(self.min_value, other.min_value) + self.max_value = max(self.max_value, other.max_value) + + @staticmethod + def fromdict(d): + return BedFieldSummary(**d) + + +@dataclasses.dataclass +class BedField: + category: str + name: str + alias: str + description: str + bed_dtype: str + summary: BedFieldSummary + + def smallest_dtype(self): + """Return the smallest dtype suitable for this field based on + type and values""" + s = self.summary + if self.bed_dtype == "Integer": + if not math.isfinite(s.max_value): + ret = "i1" + else: + ret = core.min_int_dtype(s.min_value, s.max_value) + elif self.bed_dtype == "Character": + ret = "U1" + elif self.bed_dtype == "Category": + ret = core.min_int_dtype(s.min_value, s.max_value) + else: + assert self.bed_dtype == "String" + ret = "O" + return ret + + +def guess_bed_file_type(path): + """Check the number of columns in a BED file and guess BED + type.""" + first = pd.read_table(path, header=None, nrows=1) + num_cols = first.shape[1] + if num_cols < 3: + raise ValueError(f"Expected at least 3 columns in BED file, got {num_cols}") + if num_cols > 12: + raise ValueError(f"Expected at most 12 columns in BED file, got {num_cols}") + if num_cols in (10, 11): + raise ValueError(f"BED10 and BED11 are prohibited, got {num_cols} columns") + if num_cols in (9, 12): + raise ValueError("BED9 and BED12 are valid but currently unsupported formats") + return BedType(num_cols) + + +# See https://samtools.github.io/hts-specs/BEDv1.pdf +def mandatory_bed_field_definitions(): + def make_field_def(name, alias, bed_dtype, description=""): + return BedField( + category="mandatory", + name=name, + alias=alias, + description=description, + bed_dtype=bed_dtype, + summary=BedFieldSummary(), + ) + + fields = [ + make_field_def("contig", "chrom", "Category", "Chromosome name"), + make_field_def("start", "chromStart", "Integer", "Name start position"), + make_field_def("end", "chromEnd", "Integer", "Name end position"), + ] + return fields + + +def optional_bed_field_definitions(num_fields=0): + def make_field_def(name, alias, bed_dtype, description=""): + return BedField( + category="optional", + name=name, + alias=alias, + description=description, + bed_dtype=bed_dtype, + summary=BedFieldSummary(), + ) + + fields = [ + make_field_def("name", "name", "Category", "Name description"), + make_field_def("score", "score", "Integer", "A numerical value"), + make_field_def("strand", "strand", "Character", "Name strand"), + make_field_def("thickStart", "thickStart", "Integer", "Thick start position"), + make_field_def("thickEnd", "thickEnd", "Integer", "Thick end position"), + make_field_def("itemRgb", "itemRgb", "Integer", "Display"), + make_field_def("blockCount", "blockCount", "Integer", "Number of blocks"), + make_field_def("blockSizes", "blockSizes", "Integer", "Block sizes"), + make_field_def( + "blockStarts", "blockStarts", "Integer", "Block start positions" + ), + ] + + return fields[:num_fields] + + +def mkfields(bed_type): + mandatory = mandatory_bed_field_definitions() + optional = optional_bed_field_definitions(bed_type.value - BedType.BED3.value) + return mandatory + optional + + +def encode_categoricals(data, bed_type): + """Convert categoricals to integer encodings.""" + contig = pd.Categorical( + data["contig"], categories=data["contig"].unique(), ordered=True + ) + contig_id = contig.categories.values + contig_id = contig_id.astype(f"= BedType.BED4.value: + name = pd.Categorical( + data["name"], categories=data["name"].unique(), ordered=True + ) + name_id = name.categories.values + name_id = name_id.astype(f"= BedType.BED4.value: + root.array("name_id", name_id, chunks=(len(name_id),)) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 0722521d..aec6811c 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -8,7 +8,7 @@ import numcodecs import tabulate -from . import plink, provenance, vcf2zarr, vcf_utils +from . import bed2zarr, plink, provenance, vcf2zarr, vcf_utils from .vcf2zarr import icf as icf_mod logger = logging.getLogger(__name__) @@ -128,6 +128,14 @@ def list_commands(self, ctx): help="Chunk size in the samples dimension", ) +records_chunk_size = click.option( + "-r", + "--records-chunk-size", + type=int, + default=None, + help="Chunk size in the records dimension", +) + schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True)) max_variant_chunks = click.option( @@ -574,6 +582,39 @@ def plink2zarr(): plink2zarr.add_command(convert_plink) +@click.command +@version +@click.argument( + "bed_path", + type=click.Path(exists=True, dir_okay=False), +) +@new_zarr_path +@records_chunk_size +@verbose +@force +def bed2zarr_main( + bed_path, + zarr_path, + records_chunk_size, + verbose, + force, +): + """ + Convert BED file to the Zarr format. Each BED column will be + converted to a Zarr array with appropriate encoding. + + The BED file must be compressed and tabix-indexed. BED9 and BED12 + formats are currently not supported. + """ + setup_logging(verbose) + check_overwrite_dir(zarr_path, force) + bed2zarr.bed2zarr( + bed_path, + zarr_path, + records_chunk_size=records_chunk_size, + ) + + @click.command @version @vcfs diff --git a/pyproject.toml b/pyproject.toml index 08bc9742..d3c07fef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ # colouredlogs pulls in humanfriendly", "cyvcf2", "bed_reader", + "pandas", ] requires-python = ">=3.9" classifiers = [ @@ -45,6 +46,7 @@ repository = "https://github.com/sgkit-dev/bio2zarr" documentation = "https://sgkit-dev.github.io/bio2zarr/" [project.scripts] +bed2zarr = "bio2zarr.cli:bed2zarr_main" vcf2zarr = "bio2zarr.cli:vcf2zarr_main" vcfpartition = "bio2zarr.cli:vcfpartition" diff --git a/tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz b/tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz new file mode 100644 index 00000000..8713cdfc Binary files /dev/null and b/tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz differ diff --git a/tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi b/tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi new file mode 100644 index 00000000..e9288838 Binary files /dev/null and b/tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi differ diff --git a/tests/data/bed/sample_mask.bed.gz b/tests/data/bed/sample_mask.bed.gz new file mode 100644 index 00000000..b89e813b Binary files /dev/null and b/tests/data/bed/sample_mask.bed.gz differ diff --git a/tests/data/bed/sample_mask.bed.gz.csi b/tests/data/bed/sample_mask.bed.gz.csi new file mode 100644 index 00000000..2d96b927 Binary files /dev/null and b/tests/data/bed/sample_mask.bed.gz.csi differ diff --git a/tests/test_bed.py b/tests/test_bed.py new file mode 100644 index 00000000..3f162679 --- /dev/null +++ b/tests/test_bed.py @@ -0,0 +1,131 @@ +import numpy as np +import pandas as pd +import pytest +import zarr + +from bio2zarr import bed2zarr + +ALLOWED_BED_FORMATS = [3, 4, 5, 6, 7, 8, 9, 12] +SUPPORTED_BED_FORMATS = [3, 4, 5, 6, 7, 8] +DISALLOWED_BED_FORMATS = [2, 10, 11, 13] +ALL_BED_FORMATS = ALLOWED_BED_FORMATS + DISALLOWED_BED_FORMATS + + +@pytest.fixture() +def bed_data(request): + data = [ + [ + "chr22", + 1000, + 5000, + "cloneA", + 960, + "+", + 1000, + 5000, + 0, + 2, + "567,488", + "0,3512", + "foo", + ], + [ + "chr22", + 2000, + 6000, + "cloneB", + 900, + "-", + 2000, + 6000, + 0, + 2, + "433,399", + "0,3601", + "bar", + ], + ] + return [row[0 : request.param] for row in data] + + +@pytest.fixture() +def bed_df(bed_data): + return pd.DataFrame(bed_data) + + +@pytest.fixture() +def bed_path(bed_data, tmp_path): + out = tmp_path / "sample.bed" + with open(out, "w") as fh: + for row in bed_data: + fh.write("\t".join(map(str, row)) + "\n") + return out + + +class TestBed2Zarr: + @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) + def test_bed2zarr(self, bed_path, bed_df, tmp_path, request): + bedspec = request.node.callspec.params["bed_data"] + bed_type = bed2zarr.BedType(bedspec) + zarr_path = tmp_path / "test.zarr" + bed2zarr.bed2zarr(bed_path=bed_path, zarr_path=zarr_path) + root = zarr.open(zarr_path) + np.testing.assert_array_equal(root["contig"][:], [0, 0]) + np.testing.assert_array_equal(root["contig_id"][:], ["chr22"]) + np.testing.assert_array_equal(root["start"][:], bed_df[1].values) + np.testing.assert_array_equal(root["end"][:], bed_df[2].values) + if bed_type.value >= bed2zarr.BedType.BED4.value: + np.testing.assert_array_equal(root["name"][:], [0, 1]) + np.testing.assert_array_equal(root["name_id"][:], bed_df[3].values) + if bed_type.value >= bed2zarr.BedType.BED5.value: + np.testing.assert_array_equal(root["score"][:], bed_df[4].values) + if bed_type.value >= bed2zarr.BedType.BED6.value: + np.testing.assert_array_equal(root["strand"][:], bed_df[5].values) + if bed_type.value >= bed2zarr.BedType.BED7.value: + np.testing.assert_array_equal(root["thickStart"][:], bed_df[6].values) + if bed_type.value >= bed2zarr.BedType.BED8.value: + np.testing.assert_array_equal(root["thickEnd"][:], bed_df[7].values) + print(zarr_path) + + +class TestBedData: + @pytest.mark.parametrize("bed_data", ALL_BED_FORMATS, indirect=True) + def test_guess_bed_type_from_path(self, bed_path, request): + bedspec = request.node.callspec.params["bed_data"] + match_str = rf"(got {bedspec}|prohibited|unsupported)" + if bedspec not in SUPPORTED_BED_FORMATS: + with pytest.raises(ValueError, match=match_str): + bed2zarr.guess_bed_file_type(bed_path) + else: + bed_type = bed2zarr.guess_bed_file_type(bed_path) + assert bed_type.value == bedspec + + @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) + def test_bed_fields(self, bed_path, request): + bedspec = request.node.callspec.params["bed_data"] + fields = bed2zarr.mkfields(bed2zarr.BedType(bedspec)) + assert len(fields) == bedspec + + +class TestSampleMaskBed: + bed_path = "tests/data/bed/sample_mask.bed.gz" + + @pytest.fixture() + def zarr_path(self, tmp_path): + out = tmp_path / "sample_mask.zarr" + return out + + def test_bed2zarr(self, zarr_path): + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr_path) + + +class Test1kgBed: + bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" + + @pytest.fixture() + def zarr_path(self, tmp_path): + out = tmp_path / "1kg_2020_chr20_annotations_mask.zarr" + return out + + def test_bed2zarr(self, zarr_path): + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr_path) diff --git a/tests/test_core.py b/tests/test_core.py index 16e2c0c0..0484fd22 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -202,7 +202,7 @@ def test_5_chunk_1(self, n, expected): # It works in CI on Linux, but it'll probably break at some point. # It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 4977391), + ("tests/data", 4989501), ("tests/data/vcf", 4965254), ("tests/data/vcf/sample.vcf.gz", 1089), ],