From d03819d795c6928c82c71e2530ad9221e47858e5 Mon Sep 17 00:00:00 2001 From: Simon Bigelmayr Date: Wed, 11 Mar 2026 14:31:27 +0100 Subject: [PATCH 1/4] fix: extract flowcell ID, handle backslash paths, support 6-digit dates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract actual flowcell ID using nemo's regex (000000000-AGKG7 → AGKG7) - Normalize backslashes so basename() works with Windows paths on Linux - Support 6-digit dates (ymd format, e.g. 151231) alongside 8-digit (Ymd) - Use Carbon instead of CarbonImmutable (aligns with #80) - Remove unused toString() and rawFlowcellSegment (YAGNI) - Add comprehensive test coverage for all Illumina run folder formats Co-Authored-By: Claude Opus 4.6 --- src/IlluminaRunFolder.php | 80 +++++++++++++++++++++++++-------- tests/IlluminaRunFolderTest.php | 66 +++++++++++++++++++++++---- 2 files changed, 120 insertions(+), 26 deletions(-) diff --git a/src/IlluminaRunFolder.php b/src/IlluminaRunFolder.php index 6e9c73a..6f5e0f7 100644 --- a/src/IlluminaRunFolder.php +++ b/src/IlluminaRunFolder.php @@ -2,21 +2,40 @@ namespace MLL\Utils; -use Carbon\CarbonImmutable; +use Carbon\Carbon; use function Safe\preg_match; +/** + * Parses Illumina sequencer run folder names into structured parts. + * + * Folder names follow the pattern: YYYYMMDD_InstrumentID_RunNumber_FlowcellSegment + * + * Examples: + * - MiSeq i100: 20260224_SH01038_0011_ASC2168863-SC3 + * - MiSeq: 151231_M01261_0163_000000000-AGKG7 + * - NextSeq: 160205_NB501352_0003_AH7LFFAFXX + * - MiSeq Nano: 160315_M01111_0231_000000000-D0WDA + * - Broken RFID: 160108_M01111_0222_AGKKL + */ class IlluminaRunFolder { - public CarbonImmutable $date; + public Carbon $date; public string $instrumentID; public int $runNumber; + /** + * The extracted flowcell ID (e.g., AGKG7, ASC2168863-SC3, AH7LFFAFXX). 
+ * + * Strips the optional numeric prefix from the raw segment: + * - 000000000-AGKG7 → AGKG7 + * - ASC2168863-SC3 → ASC2168863-SC3 (no prefix to strip) + */ public string $flowcellID; - public function __construct(CarbonImmutable $date, string $instrumentID, int $runNumber, string $flowcellID) + public function __construct(Carbon $date, string $instrumentID, int $runNumber, string $flowcellID) { $this->date = $date; $this->instrumentID = $instrumentID; @@ -24,40 +43,65 @@ public function __construct(CarbonImmutable $date, string $instrumentID, int $ru $this->flowcellID = $flowcellID; } - /** @example IlluminaRunFolder::parse('/path/to/20260205_SH01038_0007_ASC2139476-SC3') */ + /** + * Parse a run folder name or path into its structured parts. + * + * Accepts both forward and backslash paths. + * + * @example IlluminaRunFolder::parse('miseq_active\260310_M02074_1219_000000000-MB4RJ') + * @example IlluminaRunFolder::parse('/data/sequencing/20260205_SH01038_0007_ASC2139476-SC3') + */ public static function parse(string $runFolder): self { - $folderName = basename($runFolder); + // Normalize backslashes so basename() works on Linux with Windows-style paths + $normalized = str_replace('\\', '/', $runFolder); + $folderName = basename($normalized); + $parts = explode('_', $folderName, 4); if (count($parts) !== 4) { throw new \InvalidArgumentException("Invalid run folder format: {$runFolder}. Expected format: YYYYMMDD_InstrumentID_RunNumber_FlowcellID."); } - [$dateString, $instrumentID, $runNumberString, $flowcellID] = $parts; + [$dateString, $instrumentID, $runNumberString, $flowcellSegment] = $parts; - if (preg_match('/^\d{8}$/', $dateString) === 0) { - throw new \InvalidArgumentException("Invalid date in run folder: {$dateString}. Expected format: YYYYMMDD."); + if (preg_match('/^\d{6,8}$/', $dateString) === 0) { + throw new \InvalidArgumentException("Invalid date in run folder: {$dateString}. 
Expected 6 or 8 digit date."); } - $date = CarbonImmutable::createFromFormat('!Ymd', $dateString); - if (! $date instanceof \Carbon\CarbonImmutable) { - throw new \InvalidArgumentException("Invalid date in run folder: {$dateString}. Expected format: YYYYMMDD."); + $format = strlen($dateString) === 8 ? '!Ymd' : '!ymd'; + $date = Carbon::createFromFormat($format, $dateString); + if (! $date instanceof Carbon) { + throw new \InvalidArgumentException("Invalid date in run folder: {$dateString}."); } if ($runNumberString === '' || ! ctype_digit($runNumberString)) { throw new \InvalidArgumentException("Invalid run number in run folder: {$runNumberString}. Expected a numeric value."); } + $flowcellID = self::extractFlowcellID($flowcellSegment); + return new self($date, $instrumentID, (int) $runNumberString, $flowcellID); } - public function toString(): string + /** + * Extract the actual flowcell ID from the raw segment. + * + * Illumina run folders encode the flowcell ID in the last segment, optionally + * prefixed with zeros and a dash (broken RFID readers on older MiSeqs): + * - 000000000-AGKG7 → AGKG7 + * - 000000000-D0WDA → D0WDA + * - ASC2168863-SC3 → ASC2168863-SC3 + * - AH7LFFAFXX → AH7LFFAFXX + * - AGKKL → AGKKL + * + * @see https://gitlab.mll/nemo/nemo/-/blob/master/scripts/illumina/pipeline/ill-ended.php (flowcell regex) + */ + private static function extractFlowcellID(string $rawSegment): string { - return implode('_', [ - $this->date->format('Ymd'), - $this->instrumentID, - str_pad((string) $this->runNumber, 4, '0', STR_PAD_LEFT), - $this->flowcellID, - ]); + if (preg_match('/\d*-?([ABCDG].+)$/', $rawSegment, $matches) !== 1) { + throw new \InvalidArgumentException("Cannot extract flowcell ID from: {$rawSegment}"); + } + + return $matches[1]; } } diff --git a/tests/IlluminaRunFolderTest.php b/tests/IlluminaRunFolderTest.php index 03a80bf..1b6c22d 100644 --- a/tests/IlluminaRunFolderTest.php +++ b/tests/IlluminaRunFolderTest.php @@ -1,12 +1,14 @@ instrumentID); 
self::assertSame(7, $folder->runNumber); self::assertSame('ASC2139476-SC3', $folder->flowcellID); + + } + + public function testParseMiSeqWithZeroPrefixedFlowcell(): void + { + $folder = IlluminaRunFolder::parse('151231_M01261_0163_000000000-AGKG7'); + + self::assertSame('2015-12-31', $folder->date->format('Y-m-d')); + self::assertSame('M01261', $folder->instrumentID); + self::assertSame(163, $folder->runNumber); + self::assertSame('AGKG7', $folder->flowcellID); + + } + + public function testParseNextSeq(): void + { + $folder = IlluminaRunFolder::parse('160205_NB501352_0003_AH7LFFAFXX'); + + self::assertSame('2016-02-05', $folder->date->format('Y-m-d')); + self::assertSame('NB501352', $folder->instrumentID); + self::assertSame(3, $folder->runNumber); + self::assertSame('AH7LFFAFXX', $folder->flowcellID); + + } + + public function testParseMiSeqNanoFlowcell(): void + { + $folder = IlluminaRunFolder::parse('160315_M01111_0231_000000000-D0WDA'); + + self::assertSame('D0WDA', $folder->flowcellID); + + } + + public function testParseBrokenRfidShortFlowcell(): void + { + $folder = IlluminaRunFolder::parse('160108_M01111_0222_AGKKL'); + + self::assertSame('AGKKL', $folder->flowcellID); + } - public function testParseFromPath(): void + public function testParseFromForwardSlashPath(): void { $folder = IlluminaRunFolder::parse('/data/sequencing/20260205_SH01038_0007_ASC2139476-SC3'); - self::assertSame('2026-02-05', $folder->date->format('Y-m-d')); self::assertSame('SH01038', $folder->instrumentID); - self::assertSame(7, $folder->runNumber); self::assertSame('ASC2139476-SC3', $folder->flowcellID); } - public function testToString(): void + public function testParseFromBackslashPath(): void { - $folder = IlluminaRunFolder::parse('20260205_SH01038_0007_ASC2139476-SC3'); + $folder = IlluminaRunFolder::parse('miseq_active\260310_M02074_1219_000000000-MB4RJ'); + + self::assertSame('2026-03-10', $folder->date->format('Y-m-d')); + self::assertSame('M02074', $folder->instrumentID); 
+ self::assertSame(1219, $folder->runNumber); + self::assertSame('B4RJ', $folder->flowcellID); - self::assertSame('20260205_SH01038_0007_ASC2139476-SC3', $folder->toString()); } public function testParseRejectsInvalidPartCount(): void @@ -43,7 +86,7 @@ public function testParseRejectsInvalidPartCount(): void public function testParseRejectsInvalidDate(): void { self::expectException(\InvalidArgumentException::class); - self::expectExceptionMessage('Expected format: YYYYMMDD.'); + self::expectExceptionMessage('Expected 6 or 8 digit date.'); IlluminaRunFolder::parse('not-a-date_SH01038_0007_ASC2139476-SC3'); } @@ -62,4 +105,11 @@ public function testParseRejectsInvalidRunNumber(string $value): void self::expectExceptionMessage('Expected a numeric value.'); IlluminaRunFolder::parse($value); } + + public function testParseRejectsUnparsableFlowcellID(): void + { + self::expectException(\InvalidArgumentException::class); + self::expectExceptionMessage('Cannot extract flowcell ID from: 12345'); + IlluminaRunFolder::parse('20260205_SH01038_0007_12345'); + } } From 8f82bc4ae396843cde7c27d301c3ebf78c6669cb Mon Sep 17 00:00:00 2001 From: simbig <26680884+simbig@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:33:48 +0000 Subject: [PATCH 2/4] Apply php-cs-fixer changes --- tests/IlluminaRunFolderTest.php | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/IlluminaRunFolderTest.php b/tests/IlluminaRunFolderTest.php index 1b6c22d..0767cec 100644 --- a/tests/IlluminaRunFolderTest.php +++ b/tests/IlluminaRunFolderTest.php @@ -16,7 +16,6 @@ public function testParseMiSeqI100(): void self::assertSame('SH01038', $folder->instrumentID); self::assertSame(7, $folder->runNumber); self::assertSame('ASC2139476-SC3', $folder->flowcellID); - } public function testParseMiSeqWithZeroPrefixedFlowcell(): void @@ -27,7 +26,6 @@ public function testParseMiSeqWithZeroPrefixedFlowcell(): void self::assertSame('M01261', $folder->instrumentID); self::assertSame(163, $folder->runNumber); 
self::assertSame('AGKG7', $folder->flowcellID); - } public function testParseNextSeq(): void @@ -38,7 +36,6 @@ public function testParseNextSeq(): void self::assertSame('NB501352', $folder->instrumentID); self::assertSame(3, $folder->runNumber); self::assertSame('AH7LFFAFXX', $folder->flowcellID); - } public function testParseMiSeqNanoFlowcell(): void @@ -46,7 +43,6 @@ public function testParseMiSeqNanoFlowcell(): void $folder = IlluminaRunFolder::parse('160315_M01111_0231_000000000-D0WDA'); self::assertSame('D0WDA', $folder->flowcellID); - } public function testParseBrokenRfidShortFlowcell(): void @@ -54,7 +50,6 @@ public function testParseBrokenRfidShortFlowcell(): void $folder = IlluminaRunFolder::parse('160108_M01111_0222_AGKKL'); self::assertSame('AGKKL', $folder->flowcellID); - } public function testParseFromForwardSlashPath(): void @@ -73,7 +68,6 @@ public function testParseFromBackslashPath(): void self::assertSame('M02074', $folder->instrumentID); self::assertSame(1219, $folder->runNumber); self::assertSame('B4RJ', $folder->flowcellID); - } public function testParseRejectsInvalidPartCount(): void From 4af0d776753531f60bf7ff07c7abbc4257539dab Mon Sep 17 00:00:00 2001 From: Simon Bigelmayr Date: Wed, 11 Mar 2026 14:39:27 +0100 Subject: [PATCH 3/4] refactor: simplify verbose comments in IlluminaRunFolder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests already document all format variants — no need to repeat examples in docblocks. Remove internal GitLab reference. 
Co-Authored-By: Claude Opus 4.6 --- src/IlluminaRunFolder.php | 59 ++++++--------------------------- tests/IlluminaRunFolderTest.php | 12 +++---- 2 files changed, 17 insertions(+), 54 deletions(-) diff --git a/src/IlluminaRunFolder.php b/src/IlluminaRunFolder.php index 6f5e0f7..37beb14 100644 --- a/src/IlluminaRunFolder.php +++ b/src/IlluminaRunFolder.php @@ -6,33 +6,18 @@ use function Safe\preg_match; -/** - * Parses Illumina sequencer run folder names into structured parts. - * - * Folder names follow the pattern: YYYYMMDD_InstrumentID_RunNumber_FlowcellSegment - * - * Examples: - * - MiSeq i100: 20260224_SH01038_0011_ASC2168863-SC3 - * - MiSeq: 151231_M01261_0163_000000000-AGKG7 - * - NextSeq: 160205_NB501352_0003_AH7LFFAFXX - * - MiSeq Nano: 160315_M01111_0231_000000000-D0WDA - * - Broken RFID: 160108_M01111_0222_AGKKL - */ +/** Parses Illumina sequencer run folder names (YYYYMMDD_InstrumentID_RunNumber_FlowcellSegment). */ class IlluminaRunFolder { + private const FLOWCELL_ID_PATTERN = '/\d*-?([A-Z].+)$/'; + public Carbon $date; public string $instrumentID; public int $runNumber; - /** - * The extracted flowcell ID (e.g., AGKG7, ASC2168863-SC3, AH7LFFAFXX). - * - * Strips the optional numeric prefix from the raw segment: - * - 000000000-AGKG7 → AGKG7 - * - ASC2168863-SC3 → ASC2168863-SC3 (no prefix to strip) - */ + /** Strips optional zero-prefix from raw segment: 000000000-AGKG7 → AGKG7. */ public string $flowcellID; public function __construct(Carbon $date, string $instrumentID, int $runNumber, string $flowcellID) @@ -44,12 +29,10 @@ public function __construct(Carbon $date, string $instrumentID, int $runNumber, } /** - * Parse a run folder name or path into its structured parts. + * Accepts both bare folder names and paths with forward or backslashes. * - * Accepts both forward and backslash paths. 
- * - * @example IlluminaRunFolder::parse('miseq_active\260310_M02074_1219_000000000-MB4RJ') - * @example IlluminaRunFolder::parse('/data/sequencing/20260205_SH01038_0007_ASC2139476-SC3') + * @example IlluminaRunFolder::parse('foo\bar\260310_M02074_1219_000000000-MB4RJ') + * @example IlluminaRunFolder::parse('/path/to/20260205_SH01038_0007_ASC2139476-SC3') */ public static function parse(string $runFolder): self { @@ -64,7 +47,7 @@ public static function parse(string $runFolder): self [$dateString, $instrumentID, $runNumberString, $flowcellSegment] = $parts; - if (preg_match('/^\d{6,8}$/', $dateString) === 0) { + if (preg_match('/^(\d{6}|\d{8})$/', $dateString) === 0) { throw new \InvalidArgumentException("Invalid date in run folder: {$dateString}. Expected 6 or 8 digit date."); } @@ -78,30 +61,10 @@ public static function parse(string $runFolder): self throw new \InvalidArgumentException("Invalid run number in run folder: {$runNumberString}. Expected a numeric value."); } - $flowcellID = self::extractFlowcellID($flowcellSegment); - - return new self($date, $instrumentID, (int) $runNumberString, $flowcellID); - } - - /** - * Extract the actual flowcell ID from the raw segment. 
- * - * Illumina run folders encode the flowcell ID in the last segment, optionally - * prefixed with zeros and a dash (broken RFID readers on older MiSeqs): - * - 000000000-AGKG7 → AGKG7 - * - 000000000-D0WDA → D0WDA - * - ASC2168863-SC3 → ASC2168863-SC3 - * - AH7LFFAFXX → AH7LFFAFXX - * - AGKKL → AGKKL - * - * @see https://gitlab.mll/nemo/nemo/-/blob/master/scripts/illumina/pipeline/ill-ended.php (flowcell regex) - */ - private static function extractFlowcellID(string $rawSegment): string - { - if (preg_match('/\d*-?([ABCDG].+)$/', $rawSegment, $matches) !== 1) { - throw new \InvalidArgumentException("Cannot extract flowcell ID from: {$rawSegment}"); + if (preg_match(self::FLOWCELL_ID_PATTERN, $flowcellSegment, $matches) !== 1) { + throw new \InvalidArgumentException("Cannot extract flowcell ID from: {$flowcellSegment}"); } - return $matches[1]; + return new self($date, $instrumentID, (int) $runNumberString, $matches[1]); } } diff --git a/tests/IlluminaRunFolderTest.php b/tests/IlluminaRunFolderTest.php index 0767cec..20d3d85 100644 --- a/tests/IlluminaRunFolderTest.php +++ b/tests/IlluminaRunFolderTest.php @@ -45,16 +45,16 @@ public function testParseMiSeqNanoFlowcell(): void self::assertSame('D0WDA', $folder->flowcellID); } - public function testParseBrokenRfidShortFlowcell(): void + public function testParseFlowcellStartingWithL(): void { - $folder = IlluminaRunFolder::parse('160108_M01111_0222_AGKKL'); + $folder = IlluminaRunFolder::parse('231013_M02074_0918_000000000-L6G7G'); - self::assertSame('AGKKL', $folder->flowcellID); + self::assertSame('L6G7G', $folder->flowcellID); } public function testParseFromForwardSlashPath(): void { - $folder = IlluminaRunFolder::parse('/data/sequencing/20260205_SH01038_0007_ASC2139476-SC3'); + $folder = IlluminaRunFolder::parse('/path/to/20260205_SH01038_0007_ASC2139476-SC3'); self::assertSame('SH01038', $folder->instrumentID); self::assertSame('ASC2139476-SC3', $folder->flowcellID); @@ -62,12 +62,12 @@ public function 
testParseFromForwardSlashPath(): void public function testParseFromBackslashPath(): void { - $folder = IlluminaRunFolder::parse('miseq_active\260310_M02074_1219_000000000-MB4RJ'); + $folder = IlluminaRunFolder::parse('foo\bar\260310_M02074_1219_000000000-MB4RJ'); self::assertSame('2026-03-10', $folder->date->format('Y-m-d')); self::assertSame('M02074', $folder->instrumentID); self::assertSame(1219, $folder->runNumber); - self::assertSame('B4RJ', $folder->flowcellID); + self::assertSame('MB4RJ', $folder->flowcellID); } public function testParseRejectsInvalidPartCount(): void From c40426d8d8de639b61fa03c0e40317223e36e694 Mon Sep 17 00:00:00 2001 From: simbig <26680884+simbig@users.noreply.github.com> Date: Wed, 11 Mar 2026 14:06:52 +0000 Subject: [PATCH 4/4] Apply php-cs-fixer changes --- src/IlluminaRunFolder.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/IlluminaRunFolder.php b/src/IlluminaRunFolder.php index 37beb14..6875d62 100644 --- a/src/IlluminaRunFolder.php +++ b/src/IlluminaRunFolder.php @@ -6,7 +6,9 @@ use function Safe\preg_match; -/** Parses Illumina sequencer run folder names (YYYYMMDD_InstrumentID_RunNumber_FlowcellSegment). */ +/** + * Parses Illumina sequencer run folder names (YYYYMMDD_InstrumentID_RunNumber_FlowcellSegment; the date may also be the 6-digit YYMMDD form). + */ class IlluminaRunFolder { private const FLOWCELL_ID_PATTERN = '/\d*-?([A-Z].+)$/';