From eaff62f4595144ef850ec376e66c96113a229667 Mon Sep 17 00:00:00 2001 From: bbimber Date: Sat, 7 Feb 2026 08:51:37 -0800 Subject: [PATCH 1/2] Migrate NT sequence files from flat dir to hashed structure --- .../sequenceanalysis/RefNtSequenceModel.java | 57 +++++++--------- .../SequenceAnalysis-12.331-12.332.sql | 1 + .../SequenceAnalysis-12.331-12.332.sql | 1 + .../SequenceAnalysisMaintenanceTask.java | 52 ++++++++++----- .../SequenceAnalysisModule.java | 2 +- .../SequenceAnalysisUpgradeCode.java | 65 +++++++++++++++++++ 6 files changed, 126 insertions(+), 52 deletions(-) create mode 100644 SequenceAnalysis/resources/schemas/dbscripts/postgresql/SequenceAnalysis-12.331-12.332.sql create mode 100644 SequenceAnalysis/resources/schemas/dbscripts/sqlserver/SequenceAnalysis-12.331-12.332.sql diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/RefNtSequenceModel.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/RefNtSequenceModel.java index 4d7afe5c6..3bdc7fbb3 100644 --- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/RefNtSequenceModel.java +++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/RefNtSequenceModel.java @@ -18,7 +18,6 @@ import htsjdk.samtools.util.StringUtil; import org.apache.commons.io.IOUtils; import org.apache.logging.log4j.Logger; -import org.apache.logging.log4j.LogManager; import org.jetbrains.annotations.Nullable; import org.labkey.api.data.Container; import org.labkey.api.data.ContainerManager; @@ -32,8 +31,11 @@ import org.labkey.api.exp.api.ExpData; import org.labkey.api.exp.api.ExperimentService; import org.labkey.api.files.FileContentService; +import org.labkey.api.security.Crypt; import org.labkey.api.security.User; +import org.labkey.api.util.FileUtil; import org.labkey.api.util.MemTracker; +import org.labkey.api.util.logging.LogHelper; import org.labkey.api.writer.PrintWriters; import java.io.File; @@ -55,7 +57,9 @@ */ public class RefNtSequenceModel implements Serializable { - private static final Logger _log = LogManager.getLogger(RefNtSequenceModel.class); + private static final Logger _log = LogHelper.getLogger(RefNtSequenceModel.class, "Messages related to Reference NT Sequences"); + + public static String BASE_DIRNAME = ".sequences"; private int _rowid; private String _name; @@ -414,7 +418,7 @@ public byte[] getSequenceBases() public void createFileForSequence(User u, String sequence, @Nullable File outDir) throws IOException { - File output = getExpectedSequenceFile(outDir); + File output = getExpectedSequenceFile(); if (output.exists()) { output.delete(); @@ -439,9 +443,9 @@ public void createFileForSequence(User u, String sequence, @Nullable File outDir Table.update(u, ti, this, _rowid); } - private File getExpectedSequenceFile(@Nullable File outDir) throws IllegalArgumentException + public File getExpectedSequenceFile() throws IllegalArgumentException { - return new File(getSequenceDir(true, outDir), _rowid + ".txt.gz"); + return FileUtil.appendName(getHashedDir(true), _rowid + ".txt.gz"); } private Container getLabKeyContainer() @@ -455,20 +459,9 @@ private Container getLabKeyContainer() return c; } - private File getSequenceDir(boolean create, @Nullable File outDir) throws IllegalArgumentException + private File getBaseSequenceDir() throws IllegalArgumentException { Container c = getLabKeyContainer(); - File ret = outDir == null ? getReferenceSequenceDir(c) : outDir; - if (create && !ret.exists()) - { - ret.mkdirs(); - } - - return ret; - } - - private File getReferenceSequenceDir(Container c) throws IllegalArgumentException - { FileContentService fileService = FileContentService.get(); File root = fileService == null ? null : fileService.getFileRoot(c, FileContentService.ContentType.files); if (root == null) @@ -476,12 +469,7 @@ private File getReferenceSequenceDir(Container c) throws IllegalArgumentExceptio throw new IllegalArgumentException("File root not defined for container: " + c.getPath()); } - return new File(root, ".sequences"); - } - - public void writeSequence(Writer writer, int lineLength) throws IOException - { - writeSequence(writer, lineLength, null, null); + return FileUtil.appendName(root, BASE_DIRNAME); } public void writeSequence(Writer writer, int lineLength, Integer start, Integer end) throws IOException @@ -548,20 +536,23 @@ public void setSeqLength(Integer seqLength) _seqLength = seqLength; } - @Nullable - public File getOffsetsFile() + private File getHashedDir(boolean create) { - if (getSequenceFile() == null) - { - return null; - } + File baseDir = getBaseSequenceDir(); + String digest = Crypt.MD5.digest(String.valueOf(getRowid())); + + baseDir = FileUtil.appendName(baseDir, digest.substring(0,4)); + baseDir = FileUtil.appendName(baseDir, digest.substring(4,8)); + baseDir = FileUtil.appendName(baseDir, digest.substring(8,12)); + baseDir = FileUtil.appendName(baseDir, digest.substring(12,20)); + baseDir = FileUtil.appendName(baseDir, digest.substring(20,28)); + baseDir = FileUtil.appendName(baseDir, digest.substring(28,32)); - ExpData d = ExperimentService.get().getExpData(_sequenceFile); - if (d == null || d.getFile() == null) + if (create) { - return null; + baseDir.mkdirs(); } - return new File(d.getFile().getParentFile(), getRowid() + "_offsets.txt"); + return baseDir; } } diff --git a/SequenceAnalysis/resources/schemas/dbscripts/postgresql/SequenceAnalysis-12.331-12.332.sql b/SequenceAnalysis/resources/schemas/dbscripts/postgresql/SequenceAnalysis-12.331-12.332.sql new file mode 100644 index 000000000..2c2517351 --- /dev/null +++ b/SequenceAnalysis/resources/schemas/dbscripts/postgresql/SequenceAnalysis-12.331-12.332.sql @@ -0,0 +1 @@ +SELECT core.executeJavaUpgradeCode('migrateSequenceDirs'); \ No newline at end of file diff --git a/SequenceAnalysis/resources/schemas/dbscripts/sqlserver/SequenceAnalysis-12.331-12.332.sql b/SequenceAnalysis/resources/schemas/dbscripts/sqlserver/SequenceAnalysis-12.331-12.332.sql new file mode 100644 index 000000000..b24244d15 --- /dev/null +++ b/SequenceAnalysis/resources/schemas/dbscripts/sqlserver/SequenceAnalysis-12.331-12.332.sql @@ -0,0 +1 @@ +EXEC core.executeJavaUpgradeCode 'migrateSequenceDirs'; \ No newline at end of file diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisMaintenanceTask.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisMaintenanceTask.java index 6ea1c01a9..a87859e5e 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisMaintenanceTask.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisMaintenanceTask.java @@ -46,6 +46,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -301,10 +302,10 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel { //first sequences log.debug("Inspecting sequences"); - File sequenceDir = new File(root.getRootPath(), ".sequences"); + File sequenceDir = FileUtil.appendName(root.getRootPath(), ".sequences"); TableInfo tableRefNtSequences = SequenceAnalysisSchema.getTable(SequenceAnalysisSchema.TABLE_REF_NT_SEQUENCES); TableSelector ntTs = new TableSelector(tableRefNtSequences, new SimpleFilter(FieldKey.fromString("container"), c.getId()), null); - final Set expectedSequences = new HashSet<>(10000, 1000); + final Set expectedSequences = new HashSet<>(10000, 1000); ntTs.forEach(RefNtSequenceModel.class, m -> { if (m.getSequenceFile() == null || m.getSequenceFile() == 0) { @@ -319,26 +320,23 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel return; } - if (!d.getFile().exists()) - { - log.error("expected sequence file does not exist for sequence: " + m.getRowid() + " " + m.getName() + ", expected: " + d.getFile().getPath()); - return; - } - if (d.getFile().getAbsolutePath().toLowerCase().startsWith(sequenceDir.getAbsolutePath().toLowerCase())) { - expectedSequences.add(d.getFile().getName()); + expectedSequences.add(d.getFile()); } }); if (sequenceDir.exists()) { - for (File child : sequenceDir.listFiles()) + inspectSequenceDir(sequenceDir, expectedSequences, log); + } + + if (!expectedSequences.isEmpty()) + { + for (File missing : expectedSequences) { - if (!expectedSequences.contains(child.getName())) - { - deleteFile(child, log); - } + log.error("expected sequence file does not exist: " + missing.getPath()); + return; } } @@ -446,12 +444,12 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel continue; } - deleteFile(new File(child, fileName), log); + deleteFile(FileUtil.appendName(child, fileName), log); } } //check/verify tracks - File trackDir = new File(child, "tracks"); + File trackDir = FileUtil.appendName(child, "tracks"); if (trackDir.exists()) { Set expectedTracks = new HashSet<>(); @@ -486,7 +484,7 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel } //check/verify chainFiles - File chainDir = new File(child, "chainFiles"); + File chainDir = FileUtil.appendName(child, "chainFiles"); if (chainDir.exists()) { Set expectedChains = new HashSet<>(); @@ -555,7 +553,7 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel } } - File sequenceOutputsDir = new File(root.getRootPath(), "sequenceOutputs"); + File sequenceOutputsDir = FileUtil.appendName(root.getRootPath(), "sequenceOutputs"); if (sequenceOutputsDir.exists()) { for (File child : sequenceOutputsDir.listFiles()) @@ -576,6 +574,24 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel } } + private void inspectSequenceDir(File sequenceDir, Set expectedSequences, Logger log) throws IOException + { + for (File child : Objects.requireNonNull(sequenceDir.listFiles())) + { + if (child.isDirectory()) + { + inspectSequenceDir(child, expectedSequences, log); + } + else + { + if (!expectedSequences.remove(child)) + { + deleteFile(child, log); + } + } + } + } + private void deleteFile(File f, Logger log) throws IOException { log.info("deleting sequence file: " + f.getPath()); diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java index a4044bcae..60186f5ee 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java @@ -209,7 +209,7 @@ public String getName() @Override public Double getSchemaVersion() { - return 12.331; + return 12.332; } @Override diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisUpgradeCode.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisUpgradeCode.java index 858684d11..40b221c70 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisUpgradeCode.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisUpgradeCode.java @@ -229,4 +229,69 @@ public void updateBarcodeRC(final ModuleContext moduleContext) }); } } + + /** called at 12.331-12.332*/ + @SuppressWarnings({"UnusedDeclaration"}) + @DeferredUpgrade + public void migrateSequenceDirs(final ModuleContext moduleContext) + { + try + { + TableInfo ti = SequenceAnalysisSchema.getTable(SequenceAnalysisSchema.TABLE_REF_NT_SEQUENCES); + TableSelector ts = new TableSelector(ti); + List nts = ts.getArrayList(RefNtSequenceModel.class); + _log.info(nts.size() + " total sequences to migrate"); + int processed = 0; + for (RefNtSequenceModel nt : nts) + { + processed++; + + if (processed % 1000 == 0) + { + _log.info("{} of {} sequence files migrated", processed, nts.size()); + } + + ExpData legacyExpData = ExperimentService.get().getExpData(nt.getSequenceFile()); + if (legacyExpData == null) + { + _log.error("Missing ExpData for NT sequence: {}", nt.getSequenceFile()); + continue; + } + + File legacyFile = legacyExpData.getFile(); + if (!legacyFile.exists()) + { + _log.error("Missing file for NT sequence: {}", legacyFile.getPath()); + continue; + } + + if (!RefNtSequenceModel.BASE_DIRNAME.equals(legacyFile.getParentFile().getName())) + { + _log.error("Sequence appears to have already been migrated, this might indicate a retry after a failed move: {}", legacyFile.getPath()); + continue; + } + + File newLocation = nt.getExpectedSequenceFile(); + if (!newLocation.getParentFile().exists()) + { + newLocation.getParentFile().mkdirs(); + } + + if (newLocation.exists()) + { + _log.error("Target location for migrated sequence file exists, this might indicate a retry after a filed move: {}", newLocation.getPath()); + continue; + } + + FileUtils.copyFile(legacyFile, newLocation); + legacyExpData.setDataFileURI(newLocation.toURI()); + legacyExpData.save(moduleContext.getUpgradeUser()); + legacyFile.delete(); + } + } + catch (Exception e) + { + _log.error("Error upgrading sequenceanalysis module", e); + } + } } From 058047344d55fa3131a0ec2e6c6ca5101df96318 Mon Sep 17 00:00:00 2001 From: bbimber Date: Sat, 7 Feb 2026 08:59:29 -0800 Subject: [PATCH 2/2] Restore offset --- .../sequenceanalysis/RefNtSequenceModel.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/RefNtSequenceModel.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/RefNtSequenceModel.java index 3bdc7fbb3..71d236c70 100644 --- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/RefNtSequenceModel.java +++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/RefNtSequenceModel.java @@ -536,6 +536,23 @@ public void setSeqLength(Integer seqLength) _seqLength = seqLength; } + @Nullable + public File getOffsetsFile() + { + if (getSequenceFile() == null) + { + return null; + } + + ExpData d = ExperimentService.get().getExpData(_sequenceFile); + if (d == null || d.getFile() == null) + { + return null; + } + + return FileUtil.appendName(d.getFile().getParentFile(), getRowid() + "_offsets.txt"); + } + private File getHashedDir(boolean create) { File baseDir = getBaseSequenceDir();