diff --git a/.gitignore b/.gitignore index 6122dbf..fbddbf4 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ /vendor /composer.lock /.phpunit.result.cache +/.claude/settings.local.json diff --git a/src/GenomicPosition.php b/src/GenomicPosition.php index bdf4eb3..1ee1eaf 100644 --- a/src/GenomicPosition.php +++ b/src/GenomicPosition.php @@ -10,24 +10,20 @@ class GenomicPosition public int $position; - public function __construct(Chromosome $chromosome, int $position) + public function __construct(Chromosome $chromosome, NucleotidePosition $position) { - if ($position < 1) { - throw new \InvalidArgumentException("Position must be positive, got: {$position}."); - } - $this->chromosome = $chromosome; - $this->position = $position; + $this->position = $position->value; } - /** @example GenomicPosition::parse('chr1:123456') */ - public static function parse(string $value): self + /** @example GenomicPosition::parseOneBased('chr1:123456') */ + public static function parseOneBased(string $value): self { if (preg_match('/^([^:]+):(g\.|)(\d+)$/', $value, $matches) === 0) { throw new \InvalidArgumentException("Invalid genomic position format: {$value}. Expected format: chr1:123456."); } - return new self(new Chromosome($matches[1]), (int) $matches[3]); + return new self(new Chromosome($matches[1]), NucleotidePosition::fromOneBased((int) $matches[3])); } public function equals(self $other): bool diff --git a/src/GenomicRegion.php b/src/GenomicRegion.php index 3acf3a1..bb7bed5 100644 --- a/src/GenomicRegion.php +++ b/src/GenomicRegion.php @@ -14,27 +14,19 @@ class GenomicRegion public function __construct( Chromosome $chromosome, - int $start, - int $end + NucleotidePosition $start, + NucleotidePosition $end ) { - if ($start < 1) { - throw new \InvalidArgumentException("Start must be positive, got: {$start}."); - } - - if ($end < 1) { - throw new \InvalidArgumentException("End must be positive, got: {$end}."); - } - - if ($start > $end) { - throw new \InvalidArgumentException("End ({$end}) must not be less than start ({$start})."); + if ($start->value > $end->value) { + throw new \InvalidArgumentException("End ({$end->value}) must not be less than start ({$start->value})."); } $this->chromosome = $chromosome; - $this->start = $start; - $this->end = $end; + $this->start = $start->value; + $this->end = $end->value; } - public static function parse(string $value): self + public static function parseOneBased(string $value): self { if (preg_match('/^([^:]+):(g\.|)(\d+)(-(\d+)|)$/', $value, $matches) === 0) { throw new \InvalidArgumentException("Invalid genomic region format: {$value}. Expected format: chr1:123-456."); @@ -42,8 +34,8 @@ public static function parse(string $value): self return new self( new Chromosome($matches[1]), - (int) $matches[3], - (int) ($matches[5] ?? $matches[3]) + NucleotidePosition::fromOneBased((int) $matches[3]), + NucleotidePosition::fromOneBased((int) ($matches[5] ?? $matches[3])) ); } @@ -98,13 +90,40 @@ public function intersection(self $other): ?self return new self( $this->chromosome, - max($this->start, $other->start), - min($this->end, $other->end) + NucleotidePosition::fromOneBased(max($this->start, $other->start)), + NucleotidePosition::fromOneBased(min($this->end, $other->end)) + ); + } + + /** Constructs a 1-based closed region from 0-based half-open coordinates (BED, BAM, bigWig). */ + public static function fromZeroBasedHalfOpen(string $chromosome, int $start, int $end): self + { + return new self( + new Chromosome($chromosome), + NucleotidePosition::fromZeroBased($start), + NucleotidePosition::fromOneBased($end) ); } + /** @return array{Chromosome, int, int} Chromosome, 0-based start, half-open end. */ + public function toZeroBasedHalfOpen(): array + { + return [$this->chromosome, $this->start - 1, $this->end]; + } + private function containsCoordinate(int $position): bool { return $position >= $this->start && $position <= $this->end; } + + /** @return array */ + public function genomicPositions(): array + { + $items = []; + for ($genomicPosition = $this->start; $genomicPosition <= $this->end; ++$genomicPosition) { + $items[] = new GenomicPosition($this->chromosome, NucleotidePosition::fromOneBased($genomicPosition)); + } + + return $items; + } } diff --git a/src/NucleotidePosition.php b/src/NucleotidePosition.php new file mode 100644 index 0000000..636db93 --- /dev/null +++ b/src/NucleotidePosition.php @@ -0,0 +1,35 @@ +value = $oneBasedPosition; + } + + public static function fromOneBased(int $position): self + { + if ($position < 1) { + throw new \InvalidArgumentException("Position must be positive, got: {$position}."); + } + + $instance = new self($position); + + return $instance; + } + + public static function fromZeroBased(int $position): self + { + if ($position < 0) { + throw new \InvalidArgumentException("Position must not be negative, got: {$position}."); + } + + $instance = new self($position + 1); + + return $instance; + } +} diff --git a/tests/GenomicPositionTest.php b/tests/GenomicPositionTest.php index 6523235..efd07be 100644 --- a/tests/GenomicPositionTest.php +++ b/tests/GenomicPositionTest.php @@ -1,8 +1,8 @@ toString(new NamingConvention(NamingConvention::UCSC))); } public function testParseEnsembl(): void { - $position = GenomicPosition::parse('11:1'); + $position = GenomicPosition::parseOneBased('11:1'); self::assertSame('11:1', $position->toString(new NamingConvention(NamingConvention::ENSEMBL))); } public function testParseHGVSg(): void { - $position = GenomicPosition::parse('chr11:g.1'); + $position = GenomicPosition::parseOneBased('chr11:g.1'); self::assertSame('chr11:1', $position->toString(new NamingConvention(NamingConvention::UCSC))); } public function testOutputInBothConventions(): void { - $position = GenomicPosition::parse('chr11:12345'); + $position = GenomicPosition::parseOneBased('chr11:12345'); self::assertSame('chr11:12345', $position->toString(new NamingConvention(NamingConvention::UCSC))); self::assertSame('11:12345', $position->toString(new NamingConvention(NamingConvention::ENSEMBL))); } @@ -36,21 +36,28 @@ public function testOutputInBothConventions(): void public function testEquals(): void { self::assertTrue( - GenomicPosition::parse('chr11:100')->equals(GenomicPosition::parse('11:100')) + GenomicPosition::parseOneBased('chr11:100')->equals(GenomicPosition::parseOneBased('11:100')) ); self::assertFalse( - GenomicPosition::parse('chr11:100')->equals(GenomicPosition::parse('chr11:101')) + GenomicPosition::parseOneBased('chr11:100')->equals(GenomicPosition::parseOneBased('chr11:101')) ); self::assertFalse( - GenomicPosition::parse('chr11:100')->equals(GenomicPosition::parse('chr12:100')) + GenomicPosition::parseOneBased('chr11:100')->equals(GenomicPosition::parseOneBased('chr12:100')) ); } - public function testConstructorRejectsNonPositivePosition(): void + public function testConstructorRejectsZeroPosition(): void { self::expectException(\InvalidArgumentException::class); self::expectExceptionMessage('Position must be positive, got: 0.'); - new GenomicPosition(new Chromosome('chr1'), 0); + NucleotidePosition::fromOneBased(0); + } + + public function testConstructorRejectsNegativePosition(): void + { + self::expectException(\InvalidArgumentException::class); + self::expectExceptionMessage('Position must be positive, got: -1.'); + NucleotidePosition::fromOneBased(-1); } /** @return iterable */ @@ -58,6 +65,7 @@ public static function invalidFormats(): iterable { yield ['11:1test']; yield ['chr1:0']; + yield ['chr1:-1']; yield ['chr1:']; yield ['chr1']; } @@ -67,6 +75,6 @@ public static function invalidFormats(): iterable public function testParseRejectsInvalidFormat(string $value): void { self::expectException(\InvalidArgumentException::class); - GenomicPosition::parse($value); + GenomicPosition::parseOneBased($value); } } diff --git a/tests/GenomicRegionTest.php b/tests/GenomicRegionTest.php index feced0c..7ad4e73 100644 --- a/tests/GenomicRegionTest.php +++ b/tests/GenomicRegionTest.php @@ -10,25 +10,25 @@ final class GenomicRegionTest extends TestCase { public function testParseUCSC(): void { - $region = GenomicRegion::parse('chr11:1-2'); + $region = GenomicRegion::parseOneBased('chr11:1-2'); self::assertSame('chr11:1-2', $region->toString(new NamingConvention(NamingConvention::UCSC))); } public function testParseEnsembl(): void { - $region = GenomicRegion::parse('11:1-2'); + $region = GenomicRegion::parseOneBased('11:1-2'); self::assertSame('11:1-2', $region->toString(new NamingConvention(NamingConvention::ENSEMBL))); } public function testParseHGVSg(): void { - $region = GenomicRegion::parse('chr11:g.1-2'); + $region = GenomicRegion::parseOneBased('chr11:g.1-2'); self::assertSame('chr11:1-2', $region->toString(new NamingConvention(NamingConvention::UCSC))); } public function testParseSingleBaseRegion(): void { - $region = GenomicRegion::parse('chr1:100'); + $region = GenomicRegion::parseOneBased('chr1:100'); self::assertSame(100, $region->start); self::assertSame(100, $region->end); self::assertSame(1, $region->length()); @@ -40,26 +40,26 @@ public function testParseOnError(): void $value = '11:1_2'; self::expectException(\InvalidArgumentException::class); self::expectExceptionMessage("Invalid genomic region format: {$value}. Expected format: chr1:123-456."); - GenomicRegion::parse($value); + GenomicRegion::parseOneBased($value); } public function testStartGreaterThanEnd(): void { self::expectException(\InvalidArgumentException::class); self::expectExceptionMessage('End (1) must not be less than start (2).'); - GenomicRegion::parse('11:2-1'); + GenomicRegion::parseOneBased('11:2-1'); } public function testEquals(): void { self::assertTrue( - GenomicRegion::parse('chr11:10-20')->equals(GenomicRegion::parse('11:10-20')) + GenomicRegion::parseOneBased('chr11:10-20')->equals(GenomicRegion::parseOneBased('11:10-20')) ); self::assertFalse( - GenomicRegion::parse('chr11:10-20')->equals(GenomicRegion::parse('chr11:10-21')) + GenomicRegion::parseOneBased('chr11:10-20')->equals(GenomicRegion::parseOneBased('chr11:10-21')) ); self::assertFalse( - GenomicRegion::parse('chr11:10-20')->equals(GenomicRegion::parse('chr12:10-20')) + GenomicRegion::parseOneBased('chr11:10-20')->equals(GenomicRegion::parseOneBased('chr12:10-20')) ); } @@ -75,88 +75,88 @@ public static function lengths(): iterable #[DataProvider('lengths')] public function testLength(string $region, int $expected): void { - self::assertSame($expected, GenomicRegion::parse($region)->length()); + self::assertSame($expected, GenomicRegion::parseOneBased($region)->length()); } public function testContainsPosition(): void { - $region = GenomicRegion::parse('chr11:g.1-20'); - self::assertTrue($region->containsPosition(GenomicPosition::parse('chr11:20'))); - self::assertFalse($region->containsPosition(GenomicPosition::parse('chr11:21'))); + $region = GenomicRegion::parseOneBased('chr11:g.1-20'); + self::assertTrue($region->containsPosition(GenomicPosition::parseOneBased('chr11:20'))); + self::assertFalse($region->containsPosition(GenomicPosition::parseOneBased('chr11:21'))); } public function testContainsPositionAcrossNamingConventions(): void { - $region = GenomicRegion::parse('chr11:1-20'); - self::assertTrue($region->containsPosition(GenomicPosition::parse('11:15'))); + $region = GenomicRegion::parseOneBased('chr11:1-20'); + self::assertTrue($region->containsPosition(GenomicPosition::parseOneBased('11:15'))); } public function testContainsPositionAtStartBoundary(): void { - $region = GenomicRegion::parse('chr11:10-20'); - self::assertTrue($region->containsPosition(GenomicPosition::parse('chr11:10'))); + $region = GenomicRegion::parseOneBased('chr11:10-20'); + self::assertTrue($region->containsPosition(GenomicPosition::parseOneBased('chr11:10'))); } public function testContainsRegion(): void { - $region = GenomicRegion::parse('chr11:g.1-20'); - self::assertTrue($region->containsRegion(GenomicRegion::parse('chr11:19-20'))); - self::assertFalse($region->containsRegion(GenomicRegion::parse('chr11:21-22'))); + $region = GenomicRegion::parseOneBased('chr11:g.1-20'); + self::assertTrue($region->containsRegion(GenomicRegion::parseOneBased('chr11:19-20'))); + self::assertFalse($region->containsRegion(GenomicRegion::parseOneBased('chr11:21-22'))); } public function testIsCoveredBy(): void { - $region = GenomicRegion::parse('chr11:g.20-30'); - self::assertTrue($region->isCoveredBy(GenomicRegion::parse('chr11:g.15-35'))); - self::assertFalse($region->isCoveredBy(GenomicRegion::parse('chr11:g.22-35'))); + $region = GenomicRegion::parseOneBased('chr11:g.20-30'); + self::assertTrue($region->isCoveredBy(GenomicRegion::parseOneBased('chr11:g.15-35'))); + self::assertFalse($region->isCoveredBy(GenomicRegion::parseOneBased('chr11:g.22-35'))); } public function testIsCoveredByIdenticalRegion(): void { - $region = GenomicRegion::parse('chr11:20-30'); - self::assertTrue($region->isCoveredBy(GenomicRegion::parse('chr11:20-30'))); + $region = GenomicRegion::parseOneBased('chr11:20-30'); + self::assertTrue($region->isCoveredBy(GenomicRegion::parseOneBased('chr11:20-30'))); } public function testIntersectsPartialOverlap(): void { - $region = GenomicRegion::parse('chr11:g.20-30'); - self::assertTrue($region->intersects(GenomicRegion::parse('chr11:15-25'))); - self::assertTrue($region->intersects(GenomicRegion::parse('chr11:25-35'))); + $region = GenomicRegion::parseOneBased('chr11:g.20-30'); + self::assertTrue($region->intersects(GenomicRegion::parseOneBased('chr11:15-25'))); + self::assertTrue($region->intersects(GenomicRegion::parseOneBased('chr11:25-35'))); } public function testIntersectsNoOverlap(): void { - $region = GenomicRegion::parse('chr11:g.20-30'); - self::assertFalse($region->intersects(GenomicRegion::parse('chr11:15-19'))); + $region = GenomicRegion::parseOneBased('chr11:g.20-30'); + self::assertFalse($region->intersects(GenomicRegion::parseOneBased('chr11:15-19'))); } public function testIntersectsAdjacentRegion(): void { - $region = GenomicRegion::parse('chr11:10-20'); - self::assertFalse($region->intersects(GenomicRegion::parse('chr11:21-30'))); + $region = GenomicRegion::parseOneBased('chr11:10-20'); + self::assertFalse($region->intersects(GenomicRegion::parseOneBased('chr11:21-30'))); } public function testIntersectsSinglePointOverlap(): void { - $region = GenomicRegion::parse('chr1:10-20'); - self::assertTrue($region->intersects(GenomicRegion::parse('chr1:20-30'))); + $region = GenomicRegion::parseOneBased('chr1:10-20'); + self::assertTrue($region->intersects(GenomicRegion::parseOneBased('chr1:20-30'))); } public function testIntersectionPartial(): void { - $a = GenomicRegion::parse('chr11:10-20'); - $b = GenomicRegion::parse('chr11:15-25'); + $a = GenomicRegion::parseOneBased('chr11:10-20'); + $b = GenomicRegion::parseOneBased('chr11:15-25'); $overlap = $a->intersection($b); self::assertNotNull($overlap); - self::assertTrue($overlap->equals(GenomicRegion::parse('chr11:15-20'))); + self::assertTrue($overlap->equals(GenomicRegion::parseOneBased('chr11:15-20'))); self::assertSame(6, $overlap->length()); } public function testIntersectionFullyContained(): void { - $outer = GenomicRegion::parse('chr11:10-30'); - $inner = GenomicRegion::parse('chr11:15-20'); + $outer = GenomicRegion::parseOneBased('chr11:10-30'); + $inner = GenomicRegion::parseOneBased('chr11:15-20'); $overlap = $outer->intersection($inner); self::assertNotNull($overlap); @@ -165,28 +165,28 @@ public function testIntersectionFullyContained(): void public function testIntersectionSinglePoint(): void { - $a = GenomicRegion::parse('chr1:10-20'); - $b = GenomicRegion::parse('chr1:20-30'); + $a = GenomicRegion::parseOneBased('chr1:10-20'); + $b = GenomicRegion::parseOneBased('chr1:20-30'); $overlap = $a->intersection($b); self::assertNotNull($overlap); - self::assertTrue($overlap->equals(GenomicRegion::parse('chr1:20-20'))); + self::assertTrue($overlap->equals(GenomicRegion::parseOneBased('chr1:20-20'))); self::assertSame(1, $overlap->length()); } public function testIntersectionReturnsNullWhenNoIntersection(): void { - $a = GenomicRegion::parse('chr11:10-20'); - $b = GenomicRegion::parse('chr11:21-30'); + $a = GenomicRegion::parseOneBased('chr11:10-20'); + $b = GenomicRegion::parseOneBased('chr11:21-30'); self::assertNull($a->intersection($b)); } public function testDifferentChromosomesNeverMatch(): void { - $region = GenomicRegion::parse('chr11:1-100'); - $other = GenomicRegion::parse('chr12:10-20'); - self::assertFalse($region->containsPosition(GenomicPosition::parse('chr12:50'))); + $region = GenomicRegion::parseOneBased('chr11:1-100'); + $other = GenomicRegion::parseOneBased('chr12:10-20'); + self::assertFalse($region->containsPosition(GenomicPosition::parseOneBased('chr12:50'))); self::assertFalse($region->containsRegion($other)); self::assertFalse($region->intersects($other)); self::assertNull($region->intersection($other)); @@ -195,13 +195,13 @@ public function testDifferentChromosomesNeverMatch(): void public function testParseRejectsPositionZero(): void { self::expectException(\InvalidArgumentException::class); - GenomicRegion::parse('chr1:0-10'); + GenomicRegion::parseOneBased('chr1:0-10'); } public function testContainsRegionIsInverseOfIsCoveredBy(): void { - $outer = GenomicRegion::parse('chr11:10-30'); - $inner = GenomicRegion::parse('chr11:15-20'); + $outer = GenomicRegion::parseOneBased('chr11:10-30'); + $inner = GenomicRegion::parseOneBased('chr11:15-20'); self::assertSame( $outer->containsRegion($inner), @@ -215,14 +215,91 @@ public function testContainsRegionIsInverseOfIsCoveredBy(): void public function testIsCoveredByDifferentChromosomes(): void { - $region = GenomicRegion::parse('chr11:10-20'); - self::assertFalse($region->isCoveredBy(GenomicRegion::parse('chr12:1-100'))); + $region = GenomicRegion::parseOneBased('chr11:10-20'); + self::assertFalse($region->isCoveredBy(GenomicRegion::parseOneBased('chr12:1-100'))); + } + + public function testFromZeroBasedHalfOpenConvertsToOneBased(): void + { + // BED: chr7 55249070 55249171 (EGFR Exon 19, 0-based half-open) + $region = GenomicRegion::fromZeroBasedHalfOpen('chr7', 55249070, 55249171); + + self::assertSame(55249071, $region->start); + self::assertSame(55249171, $region->end); + self::assertSame(101, $region->length()); + } + + public function testFromZeroBasedHalfOpenSingleBase(): void + { + // BED single base: chr1 99 100 → 1-based chr1:100-100 + $region = GenomicRegion::fromZeroBasedHalfOpen('chr1', 99, 100); + + self::assertSame(100, $region->start); + self::assertSame(100, $region->end); + self::assertSame(1, $region->length()); + } + + public function testToZeroBasedHalfOpenRoundTrips(): void + { + $region = GenomicRegion::fromZeroBasedHalfOpen('chr7', 55249070, 55249171); + + [$chromosome, $start, $end] = $region->toZeroBasedHalfOpen(); + + self::assertSame('7', $chromosome->value()); + self::assertSame(55249070, $start); + self::assertSame(55249171, $end); + } + + public function testFromZeroBasedHalfOpenLengthMatchesBedFormula(): void + { + $bedStart = 1000; + $bedEnd = 2000; + + $region = GenomicRegion::fromZeroBasedHalfOpen('chr1', $bedStart, $bedEnd); + + // BED length = end - start; 1-based length = end - start + 1 + // Both must agree on the actual number of bases + self::assertSame($bedEnd - $bedStart, $region->length()); + } + + public function testGenomicPositions(): void + { + $region = GenomicRegion::parseOneBased('chr11:10-13'); + $positions = $region->genomicPositions(); + + self::assertCount(4, $positions); + self::assertEquals($region->length(), count($positions)); + + self::assertTrue($positions[0]->equals(GenomicPosition::parseOneBased('chr11:10'))); + self::assertTrue($positions[1]->equals(GenomicPosition::parseOneBased('chr11:11'))); + self::assertTrue($positions[2]->equals(GenomicPosition::parseOneBased('chr11:12'))); + self::assertTrue($positions[3]->equals(GenomicPosition::parseOneBased('chr11:13'))); + } + + public function testGenomicPositionsSingleBase(): void + { + $region = GenomicRegion::parseOneBased('chr1:42'); + $positions = $region->genomicPositions(); + + self::assertCount(1, $positions); + self::assertTrue($positions[0]->equals(GenomicPosition::parseOneBased('chr1:42'))); + } + + public function testGenomicPositionsFromZeroBasedHalfOpen(): void + { + $region = GenomicRegion::fromZeroBasedHalfOpen('chr1', 99, 102); + $positions = $region->genomicPositions(); + + self::assertCount(3, $positions); + self::assertTrue($positions[0]->equals(GenomicPosition::parseOneBased('chr1:100'))); + self::assertTrue($positions[1]->equals(GenomicPosition::parseOneBased('chr1:101'))); + self::assertTrue($positions[2]->equals(GenomicPosition::parseOneBased('chr1:102'))); } public function testIntersectionIsCommutative(): void { - $a = GenomicRegion::parse('chr11:10-20'); - $b = GenomicRegion::parse('chr11:15-25'); + $a = GenomicRegion::parseOneBased('chr11:10-20'); + $b = GenomicRegion::parseOneBased('chr11:15-25'); $ab = $a->intersection($b); $ba = $b->intersection($a);