Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,9 @@ logs
env

*.done

.snakemake
.profiles

pixi.lock
uv.lock
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ apptainer exec \
The full data generation, model training, and validation workflow is managed using [Pixi](https://pixi.sh/) for environment management and [Snakemake](https://snakemake.readthedocs.io/) for job orchestration.

```bash
# ensure all gen configs are downloaded
git submodule update --init --recursive

# install pixi, restart your shell or source your .bashrc after this. only do once.
curl -fsSL https://pixi.sh/install.sh | bash

Expand Down
8 changes: 5 additions & 3 deletions configs/tallinn/pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,15 @@ sleep $((RANDOM % 60))
# Run the job
stdbuf -oL -eL {exec_job}

# to exclude broken nodes, specify default-resources:
# - slurm_extra="'--exclude=comp-e-002,comp-s-002'"
# Cleanup
rm -rf $TMPDIR' > .profiles/tallinn/jobscript.sh \
&& echo 'executor: "slurm"
jobscript: "jobscript.sh"
jobs: unlimited
restart-times: 0
max-jobs-per-timespan: 20/1s
max-jobs-per-timespan: 5/1s
max-status-checks-per-second: 2
verbose: true
use-conda: false
Expand All @@ -73,8 +75,8 @@ default-resources:

snakefile = { cmd = "bash -c 'PYTHONPATH=. python3 mlpf/produce_snakemake.py --production ${PROD:-cms_run3} \"$@\"' --", depends-on = ["setup"], description = "Generate the snakemake files" }
all = { cmd = "bash -c 'snakemake --workflow-profile $PROFILE --snakefile snakemake_jobs/${PROD:-cms_run3}/Snakefile --use-apptainer --apptainer-args \"$APPTAINER_ARGS\" --cores ${CORES:-1} ${BATCH:+--batch all=$BATCH} all \"$@\"' --", depends-on = ["snakefile"], description = "Run the full production pipeline" }
gen = { cmd = "bash -c 'snakemake --workflow-profile $PROFILE --snakefile snakemake_jobs/${PROD:-cms_run3}/Snakefile_gen --use-apptainer --apptainer-args \"$APPTAINER_ARGS\" --cores ${CORES:-1} ${BATCH:+--batch all=$BATCH} gen \"$@\"' --", depends-on = ["snakefile"], description = "Generate samples" }
post = { cmd = "bash -c 'snakemake --workflow-profile $PROFILE --snakefile snakemake_jobs/${PROD:-cms_run3}/Snakefile_post --use-apptainer --apptainer-args \"$APPTAINER_ARGS\" --cores ${CORES:-1} ${BATCH:+--batch all=$BATCH} post \"$@\"' --", depends-on = ["snakefile"], description = "Post-process samples" }
gen = { cmd = "bash -c 'snakemake --keep-going --workflow-profile $PROFILE --snakefile snakemake_jobs/${PROD:-cms_run3}/Snakefile_gen --use-apptainer --apptainer-args \"$APPTAINER_ARGS\" --cores ${CORES:-1} ${BATCH:+--batch all=$BATCH} gen \"$@\"' --", depends-on = ["snakefile"], description = "Generate samples" }
post = { cmd = "bash -c 'snakemake --keep-going --workflow-profile $PROFILE --snakefile snakemake_jobs/${PROD:-cms_run3}/Snakefile_post --use-apptainer --apptainer-args \"$APPTAINER_ARGS\" --cores ${CORES:-1} ${BATCH:+--batch all=$BATCH} post \"$@\"' --", depends-on = ["snakefile"], description = "Post-process samples" }
tfds = { cmd = "bash -c 'snakemake --workflow-profile $PROFILE --snakefile snakemake_jobs/${PROD:-cms_run3}/Snakefile_tfds --use-apptainer --apptainer-args \"$APPTAINER_ARGS\" --cores ${CORES:-1} tfds \"$@\"' --", depends-on = ["snakefile"], description = "Create TFDS datasets" }
tfds_hit = { cmd = "bash -c 'snakemake --workflow-profile $PROFILE --snakefile snakemake_jobs/${PROD:-cms_run3}/Snakefile_tfds_hit --use-apptainer --apptainer-args \"$APPTAINER_ARGS\" --cores ${CORES:-1} tfds_hit \"$@\"' --", depends-on = ["snakefile"], description = "Create TFDS datasets with hits" }
train = { cmd = "bash -c 'snakemake --workflow-profile $PROFILE --snakefile snakemake_jobs/${PROD:-cms_run3}/Snakefile_train --use-apptainer --apptainer-args \"$APPTAINER_ARGS\" --cores ${CORES:-1} \"$@\"' --", depends-on = ["snakefile"], description = "Train the model" }
Expand Down
10 changes: 10 additions & 0 deletions mlpf/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ def get_names(cls):
Dataset.CMS.value: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
Dataset.CLIC.value: [0, 1, 2], # 1 - track, 2 - cluster
Dataset.CLD.value: [0, 1, 2], # 1 - track, 2 - cluster
Dataset.CLIC_HITS.value: [0, 1, 2], # 1 - tracker hit, 2 - calorimeter hit
Dataset.CLD_HITS.value: [0, 1, 2], # 1 - tracker hit, 2 - calorimeter hit
}

Expand All @@ -194,6 +195,7 @@ def get_names(cls):
Dataset.CMS.value: [1, 4, 5, 6, 8, 9, 10, 11],
Dataset.CLIC.value: [1, 2],
Dataset.CLD.value: [1, 2],
Dataset.CLIC_HITS.value: [1, 2],
Dataset.CLD_HITS.value: [1, 2],
}

Expand Down Expand Up @@ -287,6 +289,7 @@ def get_names(cls):
],
Dataset.CLIC.value: get_edm4hep_x_features(),
Dataset.CLD.value: get_edm4hep_x_features(),
Dataset.CLIC_HITS.value: EDM4HEP.HitFeatures.get_names(),
Dataset.CLD_HITS.value: EDM4HEP.HitFeatures.get_names(),
}

Expand All @@ -311,6 +314,13 @@ def get_names(cls):
"ptcut": 5.0,
"match_dr": 0.1,
},
Dataset.CLIC_HITS.value: {
"algo": "ee_genkt_algorithm",
"r": 0.4,
"p": -1.0,
"ptcut": 5.0,
"match_dr": 0.1,
},
Dataset.CLD_HITS.value: {
"algo": "ee_genkt_algorithm",
"r": 0.4,
Expand Down
20 changes: 20 additions & 0 deletions mlpf/data/cms/genjob_pu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,26 @@ cleanup() {
}
trap cleanup EXIT

# Staggered start to avoid thundering herd on CVMFS
sleep $((1 + RANDOM % 60))

# Pre-flight CVMFS check
CVMFS_PATH="/cvmfs/cms.cern.ch/cmsset_default.sh"
SUCCESS=0
for delay in 5 10 30; do
if [ -f "$CVMFS_PATH" ]; then
SUCCESS=1
break
fi
echo "CVMFS not found at $CVMFS_PATH on $(hostname). Retrying in ${delay}s..."
sleep $delay
done

if [ $SUCCESS -eq 0 ]; then
echo "Error: CVMFS not available on $(hostname) after multiple retries."
exit 1
fi

env
source /cvmfs/cms.cern.ch/cmsset_default.sh

Expand Down
59 changes: 39 additions & 20 deletions mlpf/data/key4hep/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1085,7 +1085,7 @@ def compute_jets(particles_p4: Any, min_pt: float = jet_ptcut, with_indices: boo
return ret


def process_one_file(fn: str, ofn: str, first_event: int = 0, num_events: int = -1) -> None:
def process_one_file(fn: str, ofn: str, detector: str, first_event: int = 0, num_events: int = -1) -> None:

# output exists, do not recreate
if os.path.isfile(ofn) and num_events == -1 and first_event == 0:
Expand Down Expand Up @@ -1175,23 +1175,41 @@ def process_one_file(fn: str, ofn: str, first_event: int = 0, num_events: int =
idx_rp_to_cluster = arrs["_PandoraPFOs_clusters/_PandoraPFOs_clusters.index"].array()
idx_rp_to_track = arrs["_PandoraPFOs_tracks/_PandoraPFOs_tracks.index"].array()

hit_collections = [
"ECALBarrel",
"ECALEndcap",
# ECALOther is missing in CLD, but there in CLIC. Need to check and make this configurable later.
# "ECALOther",
"HCALBarrel",
"HCALEndcap",
"HCALOther",
"MUON",
"LumiCalHits",
"ITrackerHits",
"ITrackerEndcapHits",
"OTrackerHits",
"OTrackerEndcapHits",
"VXDTrackerHits",
"VXDEndcapTrackerHits",
]
if detector == "clic":
hit_collections = [
"ECALBarrel",
"ECALEndcap",
"ECALOther",
"HCALBarrel",
"HCALEndcap",
"HCALOther",
"MUON",
"LumiCal_Hits",
"ITrackerHits",
"ITrackerEndcapHits",
"OTrackerHits",
"OTrackerEndcapHits",
"VXDTrackerHits",
"VXDEndcapTrackerHits",
]
elif detector == "cld":
hit_collections = [
"ECALBarrel",
"ECALEndcap",
"HCALBarrel",
"HCALEndcap",
"HCALOther",
"MUON",
"ITrackerHits",
"ITrackerEndcapHits",
"OTrackerHits",
"OTrackerEndcapHits",
"VXDTrackerHits",
"VXDEndcapTrackerHits",
]
else:
raise ValueError(f"Unknown detector type: {detector}")

hit_data = {}
for k in hit_collections:
if k in arrs:
Expand Down Expand Up @@ -1433,6 +1451,7 @@ def parse_args() -> Any:
parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, help="Input file ROOT file", required=True)
parser.add_argument("--outpath", type=str, default="raw", help="output path")
parser.add_argument("--detector", type=str, default="clic", help="detector type (clic, cld)")
parser.add_argument("--first-event", type=int, default=0, help="first event to process")
parser.add_argument("--num-events", type=int, default=-1, help="number of events to process")

Expand All @@ -1448,11 +1467,11 @@ def process(args: Any) -> None:
flist = glob.glob(args.input + "/*.root")
for infile in flist:
outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet")
process_one_file(infile, outfile, args.first_event, args.num_events)
process_one_file(infile, outfile, args.detector, args.first_event, args.num_events)
else:
infile = args.input
outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet")
process_one_file(infile, outfile, args.first_event, args.num_events)
process_one_file(infile, outfile, args.detector, args.first_event, args.num_events)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion mlpf/heptfds/cld_pf_edm4hep/qq.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import numpy as np
import tensorflow_datasets as tfds
from utils_edm import (
from mlpf.heptfds.edm4hep_utils.utils_pf import (
NUM_SPLITS,
X_FEATURES_CL,
X_FEATURES_TRK,
Expand Down
2 changes: 1 addition & 1 deletion mlpf/heptfds/cld_pf_edm4hep/ttbar.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import numpy as np
import tensorflow_datasets as tfds
from utils_edm import (
from mlpf.heptfds.edm4hep_utils.utils_pf import (
NUM_SPLITS,
X_FEATURES_CL,
X_FEATURES_TRK,
Expand Down
2 changes: 1 addition & 1 deletion mlpf/heptfds/cld_pf_edm4hep/ww_fullhad.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import numpy as np
import tensorflow_datasets as tfds
from utils_edm import (
from mlpf.heptfds.edm4hep_utils.utils_pf import (
NUM_SPLITS,
X_FEATURES_CL,
X_FEATURES_TRK,
Expand Down
2 changes: 1 addition & 1 deletion mlpf/heptfds/cld_pf_edm4hep/zz.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import numpy as np
import tensorflow_datasets as tfds
from utils_edm import (
from mlpf.heptfds.edm4hep_utils.utils_pf import (
NUM_SPLITS,
X_FEATURES_CL,
X_FEATURES_TRK,
Expand Down
9 changes: 6 additions & 3 deletions mlpf/heptfds/cld_pf_edm4hep_hits/qq.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import os
import numpy as np
import tensorflow_datasets as tfds
from .utils_hits import (
from mlpf.conf import Dataset
from mlpf.heptfds.edm4hep_utils.utils_hits import (
NUM_SPLITS,
X_FEATURES,
Y_FEATURES,
Expand Down Expand Up @@ -69,9 +70,11 @@ def _info(self) -> tfds.core.DatasetInfo:
),
)

# Abstract method needs to be specified
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
path = dl_manager.manual_dir
return split_sample(Path(path / "p8_ee_qq_ecm365"), self.builder_config, num_splits=NUM_SPLITS)
return split_sample(Path(path / "p8_ee_qq_ecm365"), self.builder_config, num_splits=NUM_SPLITS, dataset=Dataset.CLD_HITS)

# Abstract method needs to be specified
def _generate_examples(self, files):
return generate_examples(files)
return generate_examples(files, Dataset.CLD_HITS)
9 changes: 6 additions & 3 deletions mlpf/heptfds/cld_pf_edm4hep_hits/ttbar.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import os
import numpy as np
import tensorflow_datasets as tfds
from .utils_hits import (
from mlpf.conf import Dataset
from mlpf.heptfds.edm4hep_utils.utils_hits import (
NUM_SPLITS,
X_FEATURES,
Y_FEATURES,
Expand Down Expand Up @@ -69,9 +70,11 @@ def _info(self) -> tfds.core.DatasetInfo:
),
)

# Abstract method needs to be specified
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
path = dl_manager.manual_dir
return split_sample(Path(path / "p8_ee_ttbar_ecm365"), self.builder_config, num_splits=NUM_SPLITS)
return split_sample(Path(path / "p8_ee_ttbar_ecm365"), self.builder_config, num_splits=NUM_SPLITS, dataset=Dataset.CLD_HITS)

# Abstract method needs to be specified
def _generate_examples(self, files):
return generate_examples(files)
return generate_examples(files, Dataset.CLD_HITS)
9 changes: 6 additions & 3 deletions mlpf/heptfds/cld_pf_edm4hep_hits/ww_fullhad.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import os
import numpy as np
import tensorflow_datasets as tfds
from .utils_hits import (
from mlpf.conf import Dataset
from mlpf.heptfds.edm4hep_utils.utils_hits import (
NUM_SPLITS,
X_FEATURES,
Y_FEATURES,
Expand Down Expand Up @@ -69,9 +70,11 @@ def _info(self) -> tfds.core.DatasetInfo:
),
)

# Abstract method needs to be specified
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
path = dl_manager.manual_dir
return split_sample(Path(path / "p8_ee_WW_fullhad_ecm365"), self.builder_config, num_splits=NUM_SPLITS)
return split_sample(Path(path / "p8_ee_WW_fullhad_ecm365"), self.builder_config, num_splits=NUM_SPLITS, dataset=Dataset.CLD_HITS)

# Abstract method needs to be specified
def _generate_examples(self, files):
return generate_examples(files)
return generate_examples(files, Dataset.CLD_HITS)
9 changes: 6 additions & 3 deletions mlpf/heptfds/cld_pf_edm4hep_hits/zz.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import os
import numpy as np
import tensorflow_datasets as tfds
from .utils_hits import (
from mlpf.conf import Dataset
from mlpf.heptfds.edm4hep_utils.utils_hits import (
NUM_SPLITS,
X_FEATURES,
Y_FEATURES,
Expand Down Expand Up @@ -69,9 +70,11 @@ def _info(self) -> tfds.core.DatasetInfo:
),
)

# Abstract method needs to be specified
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
path = dl_manager.manual_dir
return split_sample(Path(path / "p8_ee_ZZ_ecm365"), self.builder_config, num_splits=NUM_SPLITS)
return split_sample(Path(path / "p8_ee_ZZ_ecm365"), self.builder_config, num_splits=NUM_SPLITS, dataset=Dataset.CLD_HITS)

# Abstract method needs to be specified
def _generate_examples(self, files):
return generate_examples(files)
return generate_examples(files, Dataset.CLD_HITS)
7 changes: 5 additions & 2 deletions mlpf/heptfds/clic_pf_edm4hep/qq.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from pathlib import Path

import numpy as np
from utils_edm import X_FEATURES_CL, X_FEATURES_TRK, Y_FEATURES, generate_examples, split_sample, NUM_SPLITS
from mlpf.heptfds.edm4hep_utils.utils_pf import X_FEATURES_CL, X_FEATURES_TRK, Y_FEATURES, generate_examples, split_sample, NUM_SPLITS

import tensorflow_datasets as tfds

Expand All @@ -21,7 +21,7 @@


class ClicEdmQqPf(tfds.core.GeneratorBasedBuilder):
VERSION = tfds.core.Version(os.environ.get("TFDS_VERSION", "3.0.0"))
VERSION = tfds.core.Version(os.environ.get("TFDS_VERSION", "3.1.0"))
RELEASE_NOTES = {
"1.0.0": "Initial release.",
"1.1.0": "update stats, move to 380 GeV",
Expand All @@ -36,6 +36,7 @@ class ClicEdmQqPf(tfds.core.GeneratorBasedBuilder):
"2.3.0": "Fix target/truth momentum, st=1 more inclusive: PR352",
"2.5.0": "Use 10 splits, skip 2.4.0 to unify with CMS datasets",
"3.0.0": "New generation with v1.2.4_key4hep_2025-05-29_CLIC_819e4e",
"3.1.0": "New generation with v1.2.5_key4hep_2025-05-29 500k events",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
For the raw input files in ROOT EDM4HEP format, please see the citation above.
Expand Down Expand Up @@ -81,9 +82,11 @@ def _info(self) -> tfds.core.DatasetInfo:
),
)

# Abstract method needs to be specified
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
path = dl_manager.manual_dir
return split_sample(Path(path / "p8_ee_qq_ecm380/"), self.builder_config, num_splits=NUM_SPLITS)

# Abstract method needs to be specified
def _generate_examples(self, files):
return generate_examples(files)
Loading
Loading