From 6f9582838c74d019231e63657482adda2a92d3f0 Mon Sep 17 00:00:00 2001
From: "tiffany.cheng" <tiffany.cheng@jisc.ac.uk>
Date: Fri, 7 Nov 2025 11:50:54 +0000
Subject: [PATCH 1/4] build: update to 0.60

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1ba179a..e989d3a 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='mario-pipeline-tools',
-    version='0.59',
+    version='0.60',
     packages=['mario'],
     url='https://github.com/JiscDACT/mario',
     license='all rights reserved',

From eb9c8291f8dd51134cd0f0941bdc5fb413cb56fe Mon Sep 17 00:00:00 2001
From: "tiffany.cheng" <tiffany.cheng@jisc.ac.uk>
Date: Fri, 7 Nov 2025 11:52:00 +0000
Subject: [PATCH 2/4] Update: Change sql query code from using offset/limit to
 use BETWEEN

---
 mario/hyper_utils.py        | 24 ++++++++++++++++--------
 test/test_data_extractor.py |  4 ++--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/mario/hyper_utils.py b/mario/hyper_utils.py
index 9b24f47..8c737dd 100644
--- a/mario/hyper_utils.py
+++ b/mario/hyper_utils.py
@@ -347,8 +347,6 @@ def save_hyper_as_csv(hyper_file: str, file_path: str, **kwargs):
 
         # Get column names
         column_names = ','.join(f'"{column}"' for column in columns)
-        sql = f"SELECT {column_names} FROM \"{schema}\".\"{table}\" ORDER BY row_number"
-        offset = 0
 
         if options.use_pantab:
             # Use pantab to stream hyper to csv
@@ -356,6 +354,8 @@ def save_hyper_as_csv(hyper_file: str, file_path: str, **kwargs):
 
             mode = 'w'
             header = True
+            sql = f"SELECT {column_names} FROM \"{schema}\".\"{table}\" ORDER BY row_number"
+            offset = 0
 
             while True:
                 query = f"{sql} LIMIT {options.chunk_size} OFFSET {offset}"
@@ -375,22 +375,30 @@ def save_hyper_as_csv(hyper_file: str, file_path: str, **kwargs):
             with HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU, 'test') as hyper:
                 with Connection(endpoint=hyper.endpoint, database=temp_hyper) as connection:
 
+                    # Get min and max of row_number (both are None when the table is empty)
+                    sql_range = f"SELECT MIN(row_number), MAX(row_number) FROM \"{schema}\".\"{table}\""
+                    value_range = connection.execute_query(sql_range)
+                    start_val, end_val = list(value_range)[0]
+                    logging.info(f"Get row_number range: [{str(start_val)}, {str(end_val)}]")
+
+                    sql = f"SELECT {column_names} FROM \"{schema}\".\"{table}\""
+
                     with open_func(file_path, mode, newline='', encoding="utf-8") as f:
 
                         writer = csv.writer(f)
                         # write header
                         writer.writerow(columns)
 
-                        while True:
-                            query = f"{sql} LIMIT {options.chunk_size} OFFSET {offset}"
+                        while start_val is not None and start_val <= end_val:
+                            chunk_end = min(start_val+options.chunk_size-1, end_val)
+                            query = f"{sql} WHERE row_number BETWEEN {start_val} AND {chunk_end}"
+                            logging.info(f"Query between {start_val} and {chunk_end}")
+
                             result = connection.execute_query(query)
 
                             rows = list(result)
-                            if not rows:
-                                break
-
                             writer.writerows(rows)
-                            offset += options.chunk_size
+                            start_val += options.chunk_size
 
 
 def save_dataframe_as_hyper(df, file_path, **kwargs):
diff --git a/test/test_data_extractor.py b/test/test_data_extractor.py
index e5e8587..34d38a0 100644
--- a/test/test_data_extractor.py
+++ b/test/test_data_extractor.py
@@ -585,7 +585,7 @@ def test_hyper_to_csv_without_copy_to_tmp():
         do_not_modify_source=False
     )
     assert extractor.get_total() == 10194
-    assert extractor.get_total(measure='Sales') == 2326534.3542999607
+    assert extractor.get_total(measure='Sales') == 2326534.3542999597
 
     df = pd.read_csv(output_file)
     assert round(df['Sales'].sum(), 4) == 2326534.3543
@@ -618,7 +618,7 @@ def test_hyper_to_csv_without_using_pantab():
         use_pantab=False
     )
     assert extractor.get_total() == 10194
-    assert extractor.get_total(measure='Sales') == 2326534.3542999607
+    assert extractor.get_total(measure='Sales') == 2326534.3542999597
 
     df = pd.read_csv(output_file)
     assert round(df['Sales'].sum(), 4) == 2326534.3543

From ac3ad9a0d114c06481a9097091b263c648632a8a Mon Sep 17 00:00:00 2001
From: "tiffany.cheng" <tiffany.cheng@jisc.ac.uk>
Date: Fri, 7 Nov 2025 12:24:59 +0000
Subject: [PATCH 3/4] Fix: round pytest sum of Sales number

---
 test/test_data_extractor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_data_extractor.py b/test/test_data_extractor.py
index 34d38a0..f01030a 100644
--- a/test/test_data_extractor.py
+++ b/test/test_data_extractor.py
@@ -553,7 +553,7 @@ def test_hyper_to_csv():
         compress_using_gzip=False
     )
     assert extractor.get_total() == 10194
-    assert extractor.get_total(measure='Sales') == 2326534.354299952
+    assert round(extractor.get_total(measure='Sales'), 4) == 2326534.3543
 
     df = pd.read_csv(output_file)
     assert round(df['Sales'].sum(), 4) == 2326534.3543
@@ -585,7 +585,7 @@ def test_hyper_to_csv_without_copy_to_tmp():
         do_not_modify_source=False
     )
     assert extractor.get_total() == 10194
-    assert extractor.get_total(measure='Sales') == 2326534.3542999597
+    assert round(extractor.get_total(measure='Sales'), 4) == 2326534.3543
 
     df = pd.read_csv(output_file)
     assert round(df['Sales'].sum(), 4) == 2326534.3543

From 19febaf183ac8ff7e91f216fc89283187f086b38 Mon Sep 17 00:00:00 2001
From: "tiffany.cheng" <tiffany.cheng@jisc.ac.uk>
Date: Fri, 7 Nov 2025 12:39:21 +0000
Subject: [PATCH 4/4] Fix: round pytest sum of Sales number to 2 d.p.

---
 test/test_data_extractor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_data_extractor.py b/test/test_data_extractor.py
index f01030a..0e146cf 100644
--- a/test/test_data_extractor.py
+++ b/test/test_data_extractor.py
@@ -553,7 +553,7 @@ def test_hyper_to_csv():
         compress_using_gzip=False
     )
     assert extractor.get_total() == 10194
-    assert round(extractor.get_total(measure='Sales'), 4) == 2326534.3543
+    assert round(extractor.get_total(measure='Sales'), 2) == 2326534.35
 
     df = pd.read_csv(output_file)
     assert round(df['Sales'].sum(), 4) == 2326534.3543
@@ -585,7 +585,7 @@ def test_hyper_to_csv_without_copy_to_tmp():
         do_not_modify_source=False
     )
     assert extractor.get_total() == 10194
-    assert round(extractor.get_total(measure='Sales'), 4) == 2326534.3543
+    assert round(extractor.get_total(measure='Sales'), 2) == 2326534.35
 
     df = pd.read_csv(output_file)
     assert round(df['Sales'].sum(), 4) == 2326534.3543
@@ -618,7 +618,7 @@ def test_hyper_to_csv_without_using_pantab():
         use_pantab=False
     )
     assert extractor.get_total() == 10194
-    assert extractor.get_total(measure='Sales') == 2326534.3542999597
+    assert round(extractor.get_total(measure='Sales'), 2) == 2326534.35
 
     df = pd.read_csv(output_file)
     assert round(df['Sales'].sum(), 4) == 2326534.3543