Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions data_management/services/dlu_filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,13 +126,16 @@ def rename_and_move_files(self, file_list: list[DLUFile], slide_name_map, packag

source_package_directory = self.globus_data_directory + '/' + self.globus_dir_prefix + package_id
for file in file_list:
dest_file = os.path.join(dest_package_directory, slide_name_map[file.name])
logger.info("Copying file " + os.path.join(source_package_directory, file.name) + " to "
+ os.path.join(dest_package_directory, slide_name_map[file.name]))
shutil.copy(os.path.join(source_package_directory, file.name),
dest_file)
file = DLUFile(name=slide_name_map[file.name], path=dest_package_directory,
checksum=calculate_checksum(dest_file), size=os.path.getsize(dest_file))
dest_file_name = slide_name_map[file.name]

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

Add KeyError protection for missing slide name mappings.

Line 129 accesses slide_name_map[file.name] without checking if the key exists. If a file name is not present in the mapping, this will raise a KeyError and crash the process, potentially leaving the source directory in a partially renamed state.

🛡️ Proposed fix with KeyError handling
         for file in file_list:
-            dest_file_name = slide_name_map[file.name]
+            if file.name not in slide_name_map:
+                raise ValueError(f"File '{file.name}' not found in slide_name_map")
+            dest_file_name = slide_name_map[file.name]
             renamed_src_path = os.path.join(source_package_directory, dest_file_name)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
dest_file_name = slide_name_map[file.name]
if file.name not in slide_name_map:
raise ValueError(f"File '{file.name}' not found in slide_name_map")
dest_file_name = slide_name_map[file.name]

renamed_src_path = os.path.join(source_package_directory, dest_file_name)
dest_path = os.path.join(dest_package_directory, dest_file_name)

logger.info("Renaming file " + os.path.join(source_package_directory, file.name) + " to " + renamed_src_path)
os.rename(os.path.join(source_package_directory, file.name), renamed_src_path)
logger.info("Copying file " + renamed_src_path + " to " + dest_path)
shutil.copy(renamed_src_path, dest_path)
Comment on lines +133 to +136

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | 🏗️ Heavy lift

Critical: In-place source mutation creates data loss risk.

The code renames files directly in the source Globus directory (line 134), then copies them (line 136). This approach is risky because:

  1. Data loss on failure: If shutil.copy() fails after os.rename() succeeds, the original file is permanently renamed and not copied, leaving the source in an inconsistent state
  2. No rollback: If the process crashes partway through, previously renamed files cannot be easily restored
  3. Concurrent access issues: Other processes reading from the Globus directory may fail if files are renamed mid-operation
  4. Source preservation: The source directory should typically remain unchanged until the entire operation succeeds

The standard pattern is to copy directly from source to destination with the new name, leaving the source intact until success is confirmed.

♻️ Proposed fix: Copy directly without mutating source
         for file in file_list:
+            if file.name not in slide_name_map:
+                raise ValueError(f"File '{file.name}' not found in slide_name_map")
             dest_file_name = slide_name_map[file.name]
-            renamed_src_path = os.path.join(source_package_directory, dest_file_name)
+            src_path = os.path.join(source_package_directory, file.name)
             dest_path = os.path.join(dest_package_directory, dest_file_name)
 
-            logger.info("Renaming file " + os.path.join(source_package_directory, file.name) + " to " + renamed_src_path)
-            os.rename(os.path.join(source_package_directory, file.name), renamed_src_path)
-            logger.info("Copying file " + renamed_src_path + " to " + dest_path)
-            shutil.copy(renamed_src_path, dest_path)
+            logger.info(f"Copying file {src_path} to {dest_path}")
+            shutil.copy(src_path, dest_path)
             file = DLUFile(name=dest_file_name, path=dest_package_directory,
                            checksum=calculate_checksum(dest_path), size=os.path.getsize(dest_path))
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
logger.info("Renaming file " + os.path.join(source_package_directory, file.name) + " to " + renamed_src_path)
os.rename(os.path.join(source_package_directory, file.name), renamed_src_path)
logger.info("Copying file " + renamed_src_path + " to " + dest_path)
shutil.copy(renamed_src_path, dest_path)
for file in file_list:
if file.name not in slide_name_map:
raise ValueError(f"File '{file.name}' not found in slide_name_map")
dest_file_name = slide_name_map[file.name]
src_path = os.path.join(source_package_directory, file.name)
dest_path = os.path.join(dest_package_directory, dest_file_name)
logger.info(f"Copying file {src_path} to {dest_path}")
shutil.copy(src_path, dest_path)
file = DLUFile(name=dest_file_name, path=dest_package_directory,
checksum=calculate_checksum(dest_path), size=os.path.getsize(dest_path))

file = DLUFile(name=dest_file_name, path=dest_package_directory,
checksum=calculate_checksum(dest_path), size=os.path.getsize(dest_path))
dluFiles.append(file)
return dluFiles
Comment on lines 128 to 140

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Add error handling and validate unique destination names.

The method lacks error handling around file operations and doesn't check for duplicate destination filenames. This can cause:

  1. Silent overwrites: If multiple source files map to the same dest_file_name, later files will overwrite earlier ones without warning
  2. Unhandled exceptions: Any I/O error (permissions, disk full, etc.) will crash without cleanup
  3. Partial state: Failed operations leave destination directory in an inconsistent state
🛡️ Proposed fix with validation and error handling
     def rename_and_move_files(self, file_list: list[DLUFile], slide_name_map, package_id ):
         dluFiles = []
         dest_package_directory = os.path.join(self.dlu_data_directory, self.dlu_package_dir_prefix + package_id)
         if os.path.exists(dest_package_directory):
             shutil.rmtree(dest_package_directory)
         if not os.path.exists(dest_package_directory):
             logger.info("Creating directory " + dest_package_directory)
             os.makedirs(dest_package_directory, exist_ok=True)
 
         source_package_directory = self.globus_data_directory + '/' + self.globus_dir_prefix + package_id
+        
+        # Validate no duplicate destination names
+        dest_names = set()
         for file in file_list:
+            if file.name not in slide_name_map:
+                raise ValueError(f"File '{file.name}' not found in slide_name_map")
             dest_file_name = slide_name_map[file.name]
+            if dest_file_name in dest_names:
+                raise ValueError(f"Duplicate destination filename: '{dest_file_name}'")
+            dest_names.add(dest_file_name)
+            
+        # Process files with error handling
+        for file in file_list:
+            dest_file_name = slide_name_map[file.name]
-            renamed_src_path = os.path.join(source_package_directory, dest_file_name)
+            src_path = os.path.join(source_package_directory, file.name)
             dest_path = os.path.join(dest_package_directory, dest_file_name)
-
-            logger.info("Renaming file " + os.path.join(source_package_directory, file.name) + " to " + renamed_src_path)
-            os.rename(os.path.join(source_package_directory, file.name), renamed_src_path)
-            logger.info("Copying file " + renamed_src_path + " to " + dest_path)
-            shutil.copy(renamed_src_path, dest_path)
-            file = DLUFile(name=dest_file_name, path=dest_package_directory,
+            
+            try:
+                logger.info(f"Copying file {src_path} to {dest_path}")
+                shutil.copy(src_path, dest_path)
+            except (IOError, OSError) as e:
+                logger.error(f"Failed to copy {src_path} to {dest_path}: {e}")
+                raise
+            
+            dlu_file = DLUFile(name=dest_file_name, path=dest_package_directory,
                            checksum=calculate_checksum(dest_path), size=os.path.getsize(dest_path))
-            dluFiles.append(file)
+            dluFiles.append(dlu_file)
         return dluFiles


Expand Down
Loading