ccapetz · Vyomkesh13 · Jul 14, 2025 · Jul 21, 2025 · Aug 4, 2025 · Sep 1, 2025
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 /workspaces/*
 !/workspaces/public_datasets
+!workspaces/ATLAS_Workspace
 !workspaces/CMS_project_v1/CMS_project_v1/config/CMS_project_v1_config.py
 !workspaces/CFD_workspace/CFD_project_animation/config/CFD_project_animation_config.py
 

diff --git a/README.md b/README.md
@@ -10,6 +10,56 @@ Baler is a tool used to test the feasibility of compressing different types of s
 3. Decompress the file using the model at a later time
 4. Plot the performance of the compression/decompression
 
+# Recent Major Contributions
+
+## Physics-Informed Autoencoder Models for High-Energy Physics Data (Carter Capetz)
+
+### Overview
+Significant enhancements have been made to Baler to support physics-informed machine learning for compressing high-energy physics data, with a particular focus on diphoton data analysis from the ATLAS experiment.
+
+### Key Contributions
+
+#### 1. Enhanced Autoencoder Architecture
+- **Physics-Informed Loss Functions**: Implemented specialized loss functions that preserve physical quantities during compression/decompression
+- **Diphoton Invariant Mass Preservation**: Added the `mse_loss_myy_l1` loss function, which is designed to preserve the invariant mass of diphoton systems
+- **Enhanced Model Architectures**: Extended the models module with physics-aware autoencoder variants
+
+#### 2. ATLAS Workspace Integration
+- **Complete ATLAS Data Pipeline**: Full integration with ATLAS Open Data Portal for GamGam (diphoton) analysis
+- **Data Preprocessing**: Comprehensive notebooks for processing ATLAS 2015-2016 data with physics-based cuts
+- **Physics Validation**: Tools for validating compressed data against physical constraints
+- **Multi-Period Analysis**: Support for analyzing data across multiple ATLAS data-taking periods
+
+#### 3. Advanced Data Processing
+- **ROOT File Integration**: Seamless integration with CERN ROOT files via uproot
+- **Physics-Based Filtering**: Implementation of ATLAS photon selection criteria:
+  - Photon reconstruction quality cuts
+  - Transverse momentum requirements (leading > 50 GeV, sub-leading > 30 GeV)
+  - Calorimeter isolation cuts (< 5.5%)
+  - Pseudorapidity transition region exclusions
+  - Invariant mass-based isolation requirements
+
+#### 4. Enhanced Core Modules
+- **Models Module**: Extended with physics-aware autoencoder architectures
+- **Utils Module**: Added physics calculation functions and specialized loss functions
+- **Helper Module**: Enhanced with improved data handling and validation
+- **Training Module**: Optimized for physics data with custom loss functions
+
+#### 5. Scientific Validation
+- **Invariant Mass Preservation**: Ensures compressed data maintains physical meaning
+- **Performance Metrics**: Comprehensive evaluation of compression quality vs. physics preservation
+- **Visualization Tools**: Enhanced plotting capabilities for physics analysis results
+
+### Use Cases
+- **ATLAS Diphoton Analysis**: Full pipeline from raw ROOT data to compressed representations
+- **Physics Data Compression**: General framework for compressing scientific data while preserving physical constraints
+- **High-Energy Physics**: Specialized tools for particle physics data analysis
+
+### Technical Details
+- **Data Format**: Support for .npz compressed data with physics metadata
+- **Model Architecture**: Flexible autoencoder designs with physics-informed loss functions
+- **Validation**: Comprehensive testing of compressed data against physical constraints
+- **Scalability**: Designed to handle large-scale physics datasets
 
 # Getting Started #
 **NOTE:** For the same performance and version as presented in our [Arxiv](https://arxiv.org/abs/2305.02283) paper, please use release [v1.0.0](https://github.com/baler-collaboration/baler/tree/v1.0.0) and the setup instructions given there. v1.0.0 also has a working docker implementation. We are currently experiencing some performance issues on the main branch compared.
@@ -18,8 +68,6 @@ In the links below we offer instructions on how to set up Baler and working tuto
 * [Python](docs/setup/python_setup.md)
 * [Docker/Singularity/Apptainer](docs/setup/docker_setup.md)
 
-
-
 # Contributing
 
 If you wish to contribute, please see the [contribution guidelines](https://github.com/baler-collaboration/baler/blob/main/docs/CONTRIBUTING.md).
diff --git a/baler/modules/data_processing.py b/baler/modules/data_processing.py
@@ -131,25 +131,21 @@ def find_minmax(data):
 
 
 def normalize(data, custom_norm: bool):
-    """This function scales the data to be in the range [0,1], based on the Min Max normalization method. It finds
-    the minimum and maximum values of each column and computes the values according to: x_norm = (x - x_min) / (x_max
-    - x_min).
-
-    Args: data (ndarray): A dataset of type `ndarray`. custom_norm (boolean): If you want to do Min Max normalization
-    or any custom normalization. Custom normalization is not supported at the moment.
-
-    Returns: ndarray: If not custom_norm: Input data where every column is scaled to be in the range [0,
-    1]. Otherwise, the input data is returned
+    """This function scales the data to be in the range [0,1], based on the Min Max normalization method.
     """
-    data = np.array(data)
+    # Convert to float32 first to avoid uint8 division issues
+    data = np.array(data, dtype=np.float32)
+
     if custom_norm:
         pass
     elif not custom_norm:
         true_min = np.min(data)
         true_max = np.max(data)
         feature_range = true_max - true_min
-        data = [((i - true_min) / feature_range) for i in data]
-        data = np.array(data)
+        # Avoid division by zero
+        if feature_range == 0:
+            return data - true_min
+        data = (data - true_min) / feature_range
     return data
 
 

diff --git a/baler/modules/helper.py b/baler/modules/helper.py
@@ -433,6 +433,9 @@ def get_device():
     if torch.cuda.is_available():
         dev = "cuda:0"
         device = torch.device(dev)
+    # elif torch.backends.mps.is_available():
+    #     dev = "mps"
+    #     device = torch.device(dev)
     else:
         dev = "cpu"
         device = torch.device(dev)
@@ -584,6 +587,9 @@ def compress(model_path, config):
         for idx, data_batch in enumerate(tqdm(data_dl)):
             data_batch = data_batch.to(device)
 
+            # Cast the batch to float32 before encoding (the model layers now use float32)
+            data_batch = data_batch.to(torch.float32)
+
             compressed_output = model.encode(data_batch)
 
             if config.save_error_bounded_deltas:

diff --git a/baler/modules/mnist_plotting.py b/baler/modules/mnist_plotting.py
@@ -0,0 +1,47 @@
+# import matplotlib.pyplot as plt
+# import numpy as np
+# from tqdm import trange
+
+# def plot_mnist_results(project_path, config):
+#     """Plots MNIST results in a grid showing original, reconstructed, and difference."""
+#     print("=== Plotting MNIST Results ===")
+
+#     # Load data
+#     data = np.load(config.input_path)["data"]
+#     data_decompressed = np.load(project_path + "/decompressed_output/decompressed.npz")["data"]
+
+#     # Determine number of samples to plot
+#     n_samples = min(getattr(config, 'max_plot_samples', 100), len(data))
+#     n_rows = min(10, n_samples)
+#     n_cols = 3  # Original, Reconstructed, Difference
+
+#     # Create figure
+#     fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 1.5*n_rows))
+#     fig.suptitle('MNIST Autoencoder Results', fontsize=16)
+
+#     # Plot images
+#     for i in range(n_rows):
+#         # Original
+#         axes[i,0].imshow(data[i].reshape(28,28), cmap='gray')
+#         axes[i,0].axis('off')
+#         if i == 0:
+#             axes[i,0].set_title('Original')
+
+#         # Reconstructed
+#         axes[i,1].imshow(data_decompressed[i].reshape(28,28), cmap='gray')
+#         axes[i,1].axis('off')
+#         if i == 0:
+#             axes[i,1].set_title('Reconstructed')
+
+#         # Difference
+#         diff = data[i] - data_decompressed[i]
+#         axes[i,2].imshow(diff.reshape(28,28), cmap='RdBu', center=0)
+#         axes[i,2].axis('off')
+#         if i == 0:
+#             axes[i,2].set_title('Difference')
+
+#     plt.tight_layout()
+#     plt.savefig(project_path + "/plotting/mnist_results.png", bbox_inches='tight', dpi=150)
+#     plt.close()
+
+#     print(f"Results saved to {project_path}/plotting/mnist_results.png")
diff --git a/baler/modules/models.py b/baler/modules/models.py
@@ -125,15 +125,28 @@ def __init__(self, n_features, z_dim, *args, **kwargs):
         self.activations = {}
 
         # encoder
-        self.en1 = nn.Linear(n_features, 200, dtype=torch.float64)
-        self.en2 = nn.Linear(200, 100, dtype=torch.float64)
-        self.en3 = nn.Linear(100, 50, dtype=torch.float64)
-        self.en4 = nn.Linear(50, z_dim, dtype=torch.float64)
+        self.en1 = nn.Linear(n_features, 200, dtype=torch.float32)
+        self.en2 = nn.Linear(200, 100, dtype=torch.float32)
+        self.en3 = nn.Linear(100, 50, dtype=torch.float32)
+        self.en4 = nn.Linear(50, z_dim, dtype=torch.float32)
+
+        # Initialize weights properly
+        nn.init.xavier_uniform_(self.en1.weight)
+        nn.init.xavier_uniform_(self.en2.weight)
+        nn.init.xavier_uniform_(self.en3.weight)
+        nn.init.xavier_uniform_(self.en4.weight)
+
         # decoder
-        self.de1 = nn.Linear(z_dim, 50, dtype=torch.float64)
-        self.de2 = nn.Linear(50, 100, dtype=torch.float64)
-        self.de3 = nn.Linear(100, 200, dtype=torch.float64)
-        self.de4 = nn.Linear(200, n_features, dtype=torch.float64)
+        self.de1 = nn.Linear(z_dim, 50, dtype=torch.float32)
+        self.de2 = nn.Linear(50, 100, dtype=torch.float32)
+        self.de3 = nn.Linear(100, 200, dtype=torch.float32)
+        self.de4 = nn.Linear(200, n_features, dtype=torch.float32)
+
+        # Initialize weights properly
+        nn.init.xavier_uniform_(self.de1.weight)
+        nn.init.xavier_uniform_(self.de2.weight)
+        nn.init.xavier_uniform_(self.de3.weight)
+        nn.init.xavier_uniform_(self.de4.weight)
 
         self.n_features = n_features
         self.z_dim = z_dim
@@ -412,18 +425,18 @@ def __init__(self, n_features, z_dim, *args, **kwargs):
         super(FPGA_prototype_model, self).__init__(*args, **kwargs)
 
         # encoder
-        self.en1 = nn.Linear(n_features, 20, dtype=torch.float64)
+        self.en1 = nn.Linear(n_features, 20, dtype=torch.float32)
         self.en_act1 = nn.ReLU()
-        self.en2 = nn.Linear(20, 10, dtype=torch.float64)
+        self.en2 = nn.Linear(20, 10, dtype=torch.float32)
         self.en_act2 = nn.ReLU()
-        self.en3 = nn.Linear(10, z_dim, dtype=torch.float64)
+        self.en3 = nn.Linear(10, z_dim, dtype=torch.float32)
 
         # decoder
-        self.de1 = nn.Linear(z_dim, 10, dtype=torch.float64)
+        self.de1 = nn.Linear(z_dim, 10, dtype=torch.float32)
         self.de_act1 = nn.ReLU()
-        self.de2 = nn.Linear(10, 20, dtype=torch.float64)
+        self.de2 = nn.Linear(10, 20, dtype=torch.float32)
         self.de_act2 = nn.ReLU()
-        self.de3 = nn.Linear(20, n_features, dtype=torch.float64)
+        self.de3 = nn.Linear(20, n_features, dtype=torch.float32)
 
         self.n_features = n_features
         self.z_dim = z_dim

diff --git a/baler/modules/plotting.py b/baler/modules/plotting.py
@@ -19,6 +19,7 @@
 from matplotlib.backends.backend_pdf import PdfPages
 from tqdm import tqdm
 from tqdm import trange
+from . import mnist_plotting
 
 
 def loss_plot(path_to_loss_data, output_path, config):
@@ -409,8 +410,14 @@ def plot_2D(project_path, config):
         #     tile_data_decompressed = data_decompressed[ind][0]
         # elif config.model_type == "dense":
         #     tile_data_decompressed = data_decompressed[ind][0]
-        tile_data = data[ind]
-        tile_data_decompressed = data_decompressed[ind]
+        for ind in trange(num_tiles):
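+            # For MNIST data, we need to handle the extra dimension.
+            # NOTE(review): `ind` is already used by the surrounding code, and `diff` below is
+            # computed after this loop ends, so only the last tile appears to be kept — confirm
+            # whether this inner `for` is intended.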
+            if config.model_type == "convolutional":
+                tile_data = data[ind].squeeze()  # Remove the extra dimension
+                tile_data_decompressed = data_decompressed[ind].squeeze()  # Remove the extra dimension
+            else:
+                tile_data = data[ind]
+                tile_data_decompressed = data_decompressed[ind]
 
         diff = tile_data - tile_data_decompressed
 
@@ -438,12 +445,7 @@ def plot_2D(project_path, config):
 
 
 def plot(output_path, config):
-    """Runs the appropriate plotting function based on the data dimension 1D or 2D
-
-    Args:
-        output_path (path): The path to the project directory
-        config (dataclass): The config class containing attributes set in the config file
-    """
+    """Runs the appropriate plotting function based on the data dimension 1D or 2D"""
     if config.data_dimension == 1:
         plot_1D(output_path, config)
     elif config.data_dimension == 2:

diff --git a/baler/modules/training.py b/baler/modules/training.py
@@ -85,7 +85,8 @@ def fit(
                 true_data=inputs,
                 reconstructed_data=reconstructions,
                 reg_param=regular_param,
-                validate=True,
+                # NOTE: experimental change - `validate` was originally True; set to False for testing
+                validate=False,
             )
 
         # Compute the loss-gradient with
@@ -128,7 +129,8 @@ def validate(model, test_dl, model_children, reg_param):
                 true_data=inputs,
                 reconstructed_data=reconstructions,
                 reg_param=reg_param,
-                validate=True,
+                # NOTE: experimental change - `validate` was originally True; set to False for testing
+                validate=False,
             )
             running_loss += loss.item()
 
@@ -227,8 +229,8 @@ def train(model, variables, train_data, test_data, project_path, config):
                 train_data.shape[0], 1, train_data.shape[1], train_data.shape[2]
             )
     elif config.data_dimension == 1:
-        train_ds = torch.tensor(train_data, dtype=torch.float64, device=device)
-        valid_ds = torch.tensor(test_data, dtype=torch.float64, device=device)
+        train_ds = torch.tensor(train_data, dtype=torch.float32, device=device)
+        valid_ds = torch.tensor(test_data, dtype=torch.float32, device=device)
 
     # Pushing input data into the torch-DataLoader object and combines into one DataLoader object (a basic wrapper
     # around several DataLoader objects).