From a1b14ae6ec9d025c61853e565c9780177989fcae Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Fri, 10 Apr 2026 10:30:47 -0700 Subject: [PATCH 01/15] Fix ORDER BY with named aggregates that expand to subqueries (#28) When AGGREGATE(...) AT (...) expressions expand to correlated subqueries and are referenced by alias in ORDER BY, DuckDB errors with "Alias has subquery - not yet supported." Fix by wrapping the inner query in a subquery when ORDER BY references a subquery-aliased SELECT item. --- test/sql/measures.test | 43 +++++++++ yardstick-rs/src/sql/measures.rs | 159 +++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+) diff --git a/test/sql/measures.test b/test/sql/measures.test index 5bae109..ba8f4ff 100644 --- a/test/sql/measures.test +++ b/test/sql/measures.test @@ -1771,3 +1771,46 @@ FROM sales_v WHERE year = 2023; ---- 150.0 + +# ORDER BY with named aggregate that expands to a subquery (GH #28) +query IIRR +SEMANTIC SELECT + year, + region, + AGGREGATE(revenue) AS revenue, + AGGREGATE(revenue) AT (ALL region) AS year_total +FROM sales_v +ORDER BY revenue/year_total, year, region; +---- +2022 EU 50.0 150.0 +2023 EU 75.0 225.0 +2022 US 100.0 150.0 +2023 US 150.0 225.0 + +# ORDER BY with simple alias of subquery expression (GH #28) +query IIR +SEMANTIC SELECT + year, + region, + AGGREGATE(revenue) AT (ALL region) AS year_total +FROM sales_v +ORDER BY year_total, year, region; +---- +2022 EU 150.0 +2022 US 150.0 +2023 EU 225.0 +2023 US 225.0 + +# ORDER BY DESC with subquery alias (GH #28) +query IIR +SEMANTIC SELECT + year, + region, + AGGREGATE(revenue) AT (ALL region) AS year_total +FROM sales_v +ORDER BY year_total DESC, year, region; +---- +2023 EU 225.0 +2023 US 225.0 +2022 EU 150.0 +2022 US 150.0 diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 4328a54..708c198 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5167,6 +5167,118 @@ fn validate_set_expression_requirements( None } +/// Extract alias names of SELECT items whose expressions contain subqueries. +fn extract_subquery_aliases_from_select(sql: &str) -> Vec { + let from_pos = find_top_level_keyword(sql, "FROM", 0).unwrap_or(sql.len()); + let upper = sql.to_uppercase(); + let select_start = upper.find("SELECT").map(|p| p + 6).unwrap_or(0); + let select_text = &sql[select_start..from_pos]; + + let mut aliases = Vec::new(); + let bytes = select_text.as_bytes(); + let mut depth: i32 = 0; + let mut item_start = 0; + + for i in 0..select_text.len() { + match bytes[i] { + b'(' => depth += 1, + b')' => depth -= 1, + b'\'' => { + // Skip single-quoted strings + let mut j = i + 1; + while j < select_text.len() { + if bytes[j] == b'\'' { + if j + 1 < select_text.len() && bytes[j + 1] == b'\'' { + j += 2; + } else { + break; + } + } else { + j += 1; + } + } + } + b',' if depth == 0 => { + if let Some(alias) = subquery_alias_from_item(&select_text[item_start..i]) { + aliases.push(alias); + } + item_start = i + 1; + } + _ => {} + } + } + // Last item + if let Some(alias) = subquery_alias_from_item(&select_text[item_start..]) { + aliases.push(alias); + } + + aliases +} + +/// Check if a single SELECT item contains a `(SELECT ...)` subquery and has an AS alias. +fn subquery_alias_from_item(item: &str) -> Option { + let upper = item.to_uppercase(); + if !upper.contains("(SELECT ") { + return None; + } + // Find the last top-level " AS " (not inside parentheses) + let bytes = item.as_bytes(); + let mut depth: i32 = 0; + let mut last_as_pos: Option = None; + let mut i = 0; + while i < item.len() { + match bytes[i] { + b'(' => depth += 1, + b')' => depth -= 1, + _ if depth == 0 => { + if i + 4 <= item.len() && upper[i..].starts_with(" AS ") { + last_as_pos = Some(i + 4); + } + } + _ => {} + } + i += 1; + } + let as_pos = last_as_pos?; + let alias = item[as_pos..].trim(); + // Extract identifier: alphanumeric, underscore, or quoted + let alias = alias + .split(|c: char| !c.is_alphanumeric() && c != '_' && c != '"' && c != '`') + .next() + .unwrap_or(alias) + .trim_matches('"') + .trim_matches('`'); + if alias.is_empty() { + None + } else { + Some(alias.to_string()) + } +} + +/// Check if an identifier appears as a whole word in text (case-insensitive). +fn identifier_appears_in(text: &str, ident: &str) -> bool { + let text_upper = text.to_uppercase(); + let ident_upper = ident.to_uppercase(); + let mut search_from = 0; + while let Some(pos) = text_upper[search_from..].find(&ident_upper) { + let abs = search_from + pos; + let before_ok = abs == 0 || { + let c = text.as_bytes()[abs - 1]; + !c.is_ascii_alphanumeric() && c != b'_' && c != b'.' + }; + let after_pos = abs + ident_upper.len(); + let after_ok = after_pos >= text_upper.len() || { + let c = text.as_bytes()[after_pos]; + !c.is_ascii_alphanumeric() && c != b'_' + }; + if before_ok && after_ok { + return true; + } + search_from = abs + 1; + } + false +} + /// Expand AGGREGATE() with AT modifiers in SQL pub fn expand_aggregate_with_at(sql: &str) -> AggregateExpandResult { let cte_expansion = expand_cte_queries(sql); @@ -5595,6 +5707,29 @@ pub fn expand_aggregate_with_at(sql: &str) -> AggregateExpandResult { ); } + // If ORDER BY references an alias whose SELECT expression is a subquery, + // wrap the query to avoid DuckDB's "Alias has subquery" limitation. + if let Some(order_pos) = find_top_level_keyword(&result_sql, "ORDER BY", 0) { + let subquery_aliases = extract_subquery_aliases_from_select(&result_sql); + if !subquery_aliases.is_empty() { + let order_end = find_first_top_level_keyword( + &result_sql, + order_pos + 8, + &["LIMIT", "OFFSET"], + ) + .unwrap_or(result_sql.len()); + let order_text = &result_sql[order_pos + 8..order_end]; + let needs_wrap = subquery_aliases + .iter() + .any(|alias| identifier_appears_in(order_text, alias)); + if needs_wrap { + let trailing = result_sql[order_pos..].trim_end_matches(';').trim_end(); + let inner = result_sql[..order_pos].trim_end().trim_end_matches(';').trim_end(); + result_sql = format!("SELECT * FROM ({inner}) _q {trailing}"); + } + } + } + AggregateExpandResult { had_aggregate, expanded_sql: result_sql, @@ -7539,4 +7674,28 @@ GROUP BY s.year"; "FROM (SELECT * FROM (SELECT year, region FROM a UNION ALL SELECT year, region FROM b)) _inner" )); } + + #[test] + fn test_extract_subquery_aliases_from_select() { + let sql = "SELECT year, region, SUM(revenue) AS revenue, (SELECT SUM(revenue) FROM t _inner WHERE _inner.year IS NOT DISTINCT FROM _outer.year) AS year_total FROM sales_v _outer GROUP BY year, region"; + let aliases = extract_subquery_aliases_from_select(sql); + assert_eq!(aliases, vec!["year_total"]); + } + + #[test] + fn test_extract_subquery_aliases_no_subquery() { + let sql = "SELECT year, SUM(revenue) AS revenue FROM sales_v GROUP BY year"; + let aliases = extract_subquery_aliases_from_select(sql); + assert!(aliases.is_empty()); + } + + #[test] + fn test_identifier_appears_in() { + assert!(identifier_appears_in("revenue/year_total", "year_total")); + assert!(identifier_appears_in("revenue/year_total", "revenue")); + assert!(!identifier_appears_in("revenue/year_total", "year")); + assert!(identifier_appears_in("year_total DESC", "year_total")); + assert!(identifier_appears_in(" year_total", "year_total")); + assert!(!identifier_appears_in("o.prodName", "year_total")); + } } From e25309ea642d9b80fdca74f5389b97c97df71a60 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Fri, 10 Apr 2026 16:33:29 -0700 Subject: [PATCH 02/15] Use find_top_level_keyword for outer SELECT in subquery alias extraction Fixes a bug where extract_subquery_aliases_from_select would match a SELECT inside a CTE rather than the outer query's SELECT, causing the ORDER BY wrapping to be skipped for CTE-based queries. --- yardstick-rs/src/sql/measures.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 708c198..3ac95a2 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5170,8 +5170,7 @@ fn validate_set_expression_requirements( /// Extract alias names of SELECT items whose expressions contain subqueries. fn extract_subquery_aliases_from_select(sql: &str) -> Vec { let from_pos = find_top_level_keyword(sql, "FROM", 0).unwrap_or(sql.len()); - let upper = sql.to_uppercase(); - let select_start = upper.find("SELECT").map(|p| p + 6).unwrap_or(0); + let select_start = find_top_level_keyword(sql, "SELECT", 0).map(|p| p + 6).unwrap_or(0); let select_text = &sql[select_start..from_pos]; let mut aliases = Vec::new(); From 50e133e0954efc17977059a42674076b3bb807eb Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 07:02:08 -0700 Subject: [PATCH 03/15] Skip string literals and comments when detecting ORDER BY alias refs Fixes two issues from PR review: 1. extract_subquery_aliases_from_select did not advance past quoted strings, so commas inside literals could split SELECT items 2. identifier_appears_in matched alias names inside string literals and comments, potentially triggering unnecessary query wrapping --- yardstick-rs/src/sql/measures.rs | 102 ++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 14 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 3ac95a2..3397efb 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5177,23 +5177,23 @@ fn extract_subquery_aliases_from_select(sql: &str) -> Vec { let bytes = select_text.as_bytes(); let mut depth: i32 = 0; let mut item_start = 0; + let mut i = 0; - for i in 0..select_text.len() { + while i < select_text.len() { match bytes[i] { b'(' => depth += 1, b')' => depth -= 1, b'\'' => { - // Skip single-quoted strings - let mut j = i + 1; - while j < select_text.len() { - if bytes[j] == b'\'' { - if j + 1 < select_text.len() && bytes[j + 1] == b'\'' { - j += 2; + i += 1; + while i < select_text.len() { + if bytes[i] == b'\'' { + if i + 1 < select_text.len() && bytes[i + 1] == b'\'' { + i += 2; } else { break; } } else { - j += 1; + i += 1; } } } @@ -5205,6 +5205,7 @@ fn extract_subquery_aliases_from_select(sql: &str) -> Vec { } _ => {} } + i += 1; } // Last item if let Some(alias) = subquery_alias_from_item(&select_text[item_start..]) { @@ -5254,20 +5255,23 @@ fn subquery_alias_from_item(item: &str) -> Option { } } -/// Check if an identifier appears as a whole word in text (case-insensitive). +/// Check if an identifier appears as a whole word in text (case-insensitive), +/// skipping string literals and comments. fn identifier_appears_in(text: &str, ident: &str) -> bool { - let text_upper = text.to_uppercase(); + // Strip string literals and comments before scanning for the identifier. + let stripped = strip_literals_and_comments(text); + let stripped_upper = stripped.to_uppercase(); let ident_upper = ident.to_uppercase(); let mut search_from = 0; - while let Some(pos) = text_upper[search_from..].find(&ident_upper) { + while let Some(pos) = stripped_upper[search_from..].find(&ident_upper) { let abs = search_from + pos; let before_ok = abs == 0 || { - let c = text.as_bytes()[abs - 1]; + let c = stripped.as_bytes()[abs - 1]; !c.is_ascii_alphanumeric() && c != b'_' && c != b'.' }; let after_pos = abs + ident_upper.len(); - let after_ok = after_pos >= text_upper.len() || { - let c = text.as_bytes()[after_pos]; + let after_ok = after_pos >= stripped_upper.len() || { + let c = stripped.as_bytes()[after_pos]; !c.is_ascii_alphanumeric() && c != b'_' }; if before_ok && after_ok { @@ -5278,6 +5282,60 @@ fn identifier_appears_in(text: &str, ident: &str) -> bool { false } +/// Replace string literals and comments with spaces to avoid false matches. +fn strip_literals_and_comments(text: &str) -> String { + let bytes = text.as_bytes(); + let mut out = text.to_string().into_bytes(); + let mut i = 0; + while i < bytes.len() { + match bytes[i] { + b'\'' => { + out[i] = b' '; + i += 1; + while i < bytes.len() { + if bytes[i] == b'\'' { + if i + 1 < bytes.len() && bytes[i + 1] == b'\'' { + out[i] = b' '; + out[i + 1] = b' '; + i += 2; + } else { + out[i] = b' '; + i += 1; + break; + } + } else { + out[i] = b' '; + i += 1; + } + } + } + b'-' if i + 1 < bytes.len() && bytes[i + 1] == b'-' => { + while i < bytes.len() && bytes[i] != b'\n' { + out[i] = b' '; + i += 1; + } + } + b'/' if i + 1 < bytes.len() && bytes[i + 1] == b'*' => { + out[i] = b' '; + out[i + 1] = b' '; + i += 2; + while i + 1 < bytes.len() { + if bytes[i] == b'*' && bytes[i + 1] == b'/' { + out[i] = b' '; + out[i + 1] = b' '; + i += 2; + break; + } + out[i] = b' '; + i += 1; + } + } + _ => i += 1, + } + } + String::from_utf8(out).unwrap_or_else(|_| text.to_string()) +} + /// Expand AGGREGATE() with AT modifiers in SQL pub fn expand_aggregate_with_at(sql: &str) -> AggregateExpandResult { let cte_expansion = expand_cte_queries(sql); @@ -7696,5 +7754,21 @@ GROUP BY s.year"; assert!(identifier_appears_in("year_total DESC", "year_total")); assert!(identifier_appears_in(" year_total", "year_total")); assert!(!identifier_appears_in("o.prodName", "year_total")); + // Should not match inside string literals + assert!(!identifier_appears_in("'year_total'", "year_total")); + assert!(!identifier_appears_in("'contains year_total inside'", "year_total")); + // Should not match inside comments + assert!(!identifier_appears_in("-- year_total\nrevenue", "year_total")); + assert!(!identifier_appears_in("/* year_total */ revenue", "year_total")); + // Should still match real identifiers alongside literals + assert!(identifier_appears_in("'literal' || year_total", "year_total")); + } + + #[test] + fn test_extract_subquery_aliases_with_string_in_item() { + // Comma inside a string literal should not split the item + let sql = "SELECT 'a,b' || (SELECT 1) AS x FROM t"; + let aliases = extract_subquery_aliases_from_select(sql); + assert_eq!(aliases, vec!["x"]); } } From 0b32f93d8379e3754ced7b039a54dc1253e9a4ff Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 07:32:01 -0700 Subject: [PATCH 04/15] Fix UTF-8 boundary panic and quoted qualifier false positives - Use char_indices() in subquery_alias_from_item to avoid slicing at invalid UTF-8 boundaries when multibyte characters appear - Reject matches preceded by " or ` in identifier_appears_in to avoid treating o."year_total" as an unqualified alias reference --- yardstick-rs/src/sql/measures.rs | 33 +++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 3397efb..0bce4bf 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5221,23 +5221,23 @@ fn subquery_alias_from_item(item: &str) -> Option { if !upper.contains("(SELECT ") { return None; } - // Find the last top-level " AS " (not inside parentheses) - let bytes = item.as_bytes(); + // Find the last top-level " AS " (not inside parentheses). + // Use char_indices to stay on valid UTF-8 boundaries. let mut depth: i32 = 0; let mut last_as_pos: Option = None; - let mut i = 0; - while i < item.len() { - match bytes[i] { - b'(' => depth += 1, - b')' => depth -= 1, - _ if depth == 0 => { - if i + 4 <= item.len() && upper[i..].starts_with(" AS ") { + for (i, ch) in item.char_indices() { + match ch { + '(' => depth += 1, + ')' => depth -= 1, + _ if depth == 0 && ch == ' ' => { + if upper[i..].starts_with(" AS ") + && item.is_char_boundary(i + 4) + { last_as_pos = Some(i + 4); } } _ => {} } - i += 1; } let as_pos = last_as_pos?; let alias = item[as_pos..].trim(); @@ -5267,7 +5267,7 @@ fn identifier_appears_in(text: &str, ident: &str) -> bool { let abs = search_from + pos; let before_ok = abs == 0 || { let c = stripped.as_bytes()[abs - 1]; - !c.is_ascii_alphanumeric() && c != b'_' && c != b'.' + !c.is_ascii_alphanumeric() && c != b'_' && c != b'.' && c != b'"' && c != b'`' }; let after_pos = abs + ident_upper.len(); let after_ok = after_pos >= stripped_upper.len() || { @@ -7762,6 +7762,9 @@ GROUP BY s.year"; assert!(!identifier_appears_in("/* year_total */ revenue", "year_total")); // Should still match real identifiers alongside literals assert!(identifier_appears_in("'literal' || year_total", "year_total")); + // Should not match quoted qualified names like o."year_total" + assert!(!identifier_appears_in(r#"o."year_total""#, "year_total")); + assert!(!identifier_appears_in(r#"o.`year_total`"#, "year_total")); } #[test] @@ -7771,4 +7774,12 @@ GROUP BY s.year"; let aliases = extract_subquery_aliases_from_select(sql); assert_eq!(aliases, vec!["x"]); } + + #[test] + fn test_subquery_alias_from_item_multibyte() { + // Should not panic on multibyte UTF-8 characters + let item = " (SELECT 1) || '\u{00e9}' AS year_total"; + let alias = subquery_alias_from_item(item); + assert_eq!(alias, Some("year_total".to_string())); + } } From e630dbb8832a61578f03837ea2199c44f16b1639 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 08:14:05 -0700 Subject: [PATCH 05/15] Fix uppercase byte-offset mismatch and bare quoted alias detection - subquery_alias_from_item: compare " AS " directly on item bytes instead of slicing the uppercased string, which can have different byte offsets for certain Unicode case mappings - identifier_appears_in: only reject " and ` before a match when preceded by . (qualified reference like o."alias"), allowing bare quoted identifiers like "year_total" to match correctly --- yardstick-rs/src/sql/measures.rs | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 551c2e7..e71aa41 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5524,21 +5524,21 @@ fn extract_subquery_aliases_from_select(sql: &str) -> Vec { /// Check if a single SELECT item contains a `(SELECT ...)` subquery and has an AS alias. fn subquery_alias_from_item(item: &str) -> Option { - let upper = item.to_uppercase(); - if !upper.contains("(SELECT ") { + if !item.to_uppercase().contains("(SELECT ") { return None; } // Find the last top-level " AS " (not inside parentheses). - // Use char_indices to stay on valid UTF-8 boundaries. + // Compare directly on `item` using case-insensitive matching to avoid + // byte-offset mismatches between `item` and its uppercase form. let mut depth: i32 = 0; let mut last_as_pos: Option = None; for (i, ch) in item.char_indices() { match ch { '(' => depth += 1, ')' => depth -= 1, - _ if depth == 0 && ch == ' ' => { - if upper[i..].starts_with(" AS ") - && item.is_char_boundary(i + 4) + ' ' if depth == 0 => { + if item[i..].len() >= 4 + && item[i..].as_bytes()[1..4].eq_ignore_ascii_case(b"AS ") { last_as_pos = Some(i + 4); } @@ -5574,7 +5574,13 @@ fn identifier_appears_in(text: &str, ident: &str) -> bool { let abs = search_from + pos; let before_ok = abs == 0 || { let c = stripped.as_bytes()[abs - 1]; - !c.is_ascii_alphanumeric() && c != b'_' && c != b'.' && c != b'"' && c != b'`' + if c == b'"' || c == b'`' { + // Bare quoted identifier like "year_total" is OK; + // qualified like o."year_total" (dot before quote) is not. + abs < 2 || stripped.as_bytes()[abs - 2] != b'.' + } else { + !c.is_ascii_alphanumeric() && c != b'_' && c != b'.' + } }; let after_pos = abs + ident_upper.len(); let after_ok = after_pos >= stripped_upper.len() || { @@ -8174,9 +8180,12 @@ GROUP BY s.year"; assert!(!identifier_appears_in("/* year_total */ revenue", "year_total")); // Should still match real identifiers alongside literals assert!(identifier_appears_in("'literal' || year_total", "year_total")); - // Should not match quoted qualified names like o."year_total" + // Should not match qualified quoted names like o."year_total" assert!(!identifier_appears_in(r#"o."year_total""#, "year_total")); assert!(!identifier_appears_in(r#"o.`year_total`"#, "year_total")); + // But bare quoted aliases like "year_total" should match + assert!(identifier_appears_in(r#""year_total""#, "year_total")); + assert!(identifier_appears_in(r#"`year_total`"#, "year_total")); } #[test] From 7d6f736c00f65328c2f3aa37cb7736d60e378504 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 08:31:49 -0700 Subject: [PATCH 06/15] Fix char boundary advance and handle whitespace around AS aliases - identifier_appears_in: advance search_from to the next valid char boundary after a non-matching hit, preventing panics on non-ASCII text - subquery_alias_from_item: match AS with any surrounding whitespace (tabs, newlines, multiple spaces), not just single spaces --- yardstick-rs/src/sql/measures.rs | 62 ++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index e71aa41..c2c565f 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5527,27 +5527,42 @@ fn subquery_alias_from_item(item: &str) -> Option { if !item.to_uppercase().contains("(SELECT ") { return None; } - // Find the last top-level " AS " (not inside parentheses). - // Compare directly on `item` using case-insensitive matching to avoid - // byte-offset mismatches between `item` and its uppercase form. + // Find the last top-level whitespace-AS-whitespace (not inside parentheses). + // Handles spaces, tabs, and newlines around AS. + let bytes = item.as_bytes(); let mut depth: i32 = 0; - let mut last_as_pos: Option = None; - for (i, ch) in item.char_indices() { - match ch { - '(' => depth += 1, - ')' => depth -= 1, - ' ' if depth == 0 => { - if item[i..].len() >= 4 - && item[i..].as_bytes()[1..4].eq_ignore_ascii_case(b"AS ") + let mut last_as_end: Option = None; + let mut i = 0; + while i < bytes.len() { + match bytes[i] { + b'(' => depth += 1, + b')' => depth -= 1, + c if depth == 0 && c.is_ascii_whitespace() => { + // Skip leading whitespace + let ws_start = i; + while i < bytes.len() && bytes[i].is_ascii_whitespace() { + i += 1; + } + // Check for "AS" followed by whitespace + if i + 2 <= bytes.len() + && bytes[i..i + 2].eq_ignore_ascii_case(b"AS") + && (i + 2 >= bytes.len() || bytes[i + 2].is_ascii_whitespace()) { - last_as_pos = Some(i + 4); + let _ = ws_start; + let mut end = i + 2; + while end < bytes.len() && bytes[end].is_ascii_whitespace() { + end += 1; + } + last_as_end = Some(end); } + continue; // already advanced i past whitespace } _ => {} } + i += 1; } - let as_pos = last_as_pos?; - let alias = item[as_pos..].trim(); + let as_end = last_as_end?; + let alias = item[as_end..].trim(); // Extract identifier: alphanumeric, underscore, or quoted let alias = alias .split(|c: char| !c.is_alphanumeric() && c != '_' && c != '"' && c != '`') @@ -5590,7 +5605,13 @@ fn identifier_appears_in(text: &str, ident: &str) -> bool { if before_ok && after_ok { return true; } + // Advance past the current match start, staying on a char boundary search_from = abs + 1; + while search_from < stripped_upper.len() + && !stripped_upper.is_char_boundary(search_from) + { + search_from += 1; + } } false } @@ -8204,6 +8225,19 @@ GROUP BY s.year"; assert_eq!(alias, Some("year_total".to_string())); } + #[test] + fn test_subquery_alias_from_item_whitespace_variants() { + // Tab between AS and alias + let item = " (SELECT 1) AS\tyear_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Newline between AS and alias + let item = " (SELECT 1) AS\nyear_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Multiple spaces + let item = " (SELECT 1) AS year_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + } + #[test] fn test_is_expression_dim() { // Simple column references From 4ec199d84bb277ccb6b7b3143b9efd511b41c16a Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 08:39:50 -0700 Subject: [PATCH 07/15] Relax subquery detection and skip parenthesized alias matches - item_has_subquery: detect subqueries with whitespace after opening paren and CTE-style (WITH ... SELECT ...) patterns - identifier_appears_in: track parenthesis depth and only match at top level, preventing false positives like EXTRACT(YEAR FROM ...) triggering unnecessary query wrapping --- yardstick-rs/src/sql/measures.rs | 72 +++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 11 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index c2c565f..6943cce 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5522,9 +5522,32 @@ fn extract_subquery_aliases_from_select(sql: &str) -> Vec { aliases } +/// Check if text contains a parenthesized subquery: `(SELECT ...)` or `(WITH ... SELECT ...)`, +/// tolerating whitespace between `(` and the keyword. +fn item_has_subquery(item: &str) -> bool { + let bytes = item.as_bytes(); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'(' { + let mut j = i + 1; + while j < bytes.len() && bytes[j].is_ascii_whitespace() { + j += 1; + } + if j + 6 <= bytes.len() && bytes[j..j + 6].eq_ignore_ascii_case(b"SELECT") { + return true; + } + if j + 4 <= bytes.len() && bytes[j..j + 4].eq_ignore_ascii_case(b"WITH") { + return true; + } + } + i += 1; + } + false +} + /// Check if a single SELECT item contains a `(SELECT ...)` subquery and has an AS alias. fn subquery_alias_from_item(item: &str) -> Option { - if !item.to_uppercase().contains("(SELECT ") { + if !item_has_subquery(item) { return None; } // Find the last top-level whitespace-AS-whitespace (not inside parentheses). @@ -5577,35 +5600,48 @@ fn subquery_alias_from_item(item: &str) -> Option { } } -/// Check if an identifier appears as a whole word in text (case-insensitive), -/// skipping string literals and comments. +/// Check if an identifier appears as a top-level whole word in text (case-insensitive), +/// skipping string literals, comments, and content inside parentheses. fn identifier_appears_in(text: &str, ident: &str) -> bool { - // Strip string literals and comments before scanning for the identifier. let stripped = strip_literals_and_comments(text); let stripped_upper = stripped.to_uppercase(); let ident_upper = ident.to_uppercase(); + + // Pre-compute parenthesis depth at each byte position + let bytes = stripped.as_bytes(); + let mut depths = vec![0i32; bytes.len()]; + let mut depth: i32 = 0; + for i in 0..bytes.len() { + if bytes[i] == b'(' { + depth += 1; + } + depths[i] = depth; + if bytes[i] == b')' { + depth -= 1; + } + } + let mut search_from = 0; while let Some(pos) = stripped_upper[search_from..].find(&ident_upper) { let abs = search_from + pos; + // Only match at top level (outside parentheses) + let at_top_level = depths[abs] == 0; let before_ok = abs == 0 || { - let c = stripped.as_bytes()[abs - 1]; + let c = bytes[abs - 1]; if c == b'"' || c == b'`' { - // Bare quoted identifier like "year_total" is OK; - // qualified like o."year_total" (dot before quote) is not. - abs < 2 || stripped.as_bytes()[abs - 2] != b'.' + abs < 2 || bytes[abs - 2] != b'.' } else { !c.is_ascii_alphanumeric() && c != b'_' && c != b'.' } }; let after_pos = abs + ident_upper.len(); let after_ok = after_pos >= stripped_upper.len() || { - let c = stripped.as_bytes()[after_pos]; + let c = bytes[after_pos]; !c.is_ascii_alphanumeric() && c != b'_' }; - if before_ok && after_ok { + if at_top_level && before_ok && after_ok { return true; } - // Advance past the current match start, staying on a char boundary search_from = abs + 1; while search_from < stripped_upper.len() && !stripped_upper.is_char_boundary(search_from) @@ -8207,6 +8243,11 @@ GROUP BY s.year"; // But bare quoted aliases like "year_total" should match assert!(identifier_appears_in(r#""year_total""#, "year_total")); assert!(identifier_appears_in(r#"`year_total`"#, "year_total")); + // Should not match inside parentheses (e.g., function arguments) + assert!(!identifier_appears_in("EXTRACT(YEAR FROM o.ts)", "year")); + assert!(!identifier_appears_in("COALESCE(year_total, 0)", "year_total")); + // But top-level references should still match + assert!(identifier_appears_in("year_total + COALESCE(x, 0)", "year_total")); } #[test] @@ -8236,6 +8277,15 @@ GROUP BY s.year"; // Multiple spaces let item = " (SELECT 1) AS year_total"; assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Space after opening paren: ( SELECT ...) + let item = " ( SELECT 1) AS year_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Newline after opening paren + let item = " (\nSELECT 1) AS year_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // CTE subquery: (WITH ... SELECT ...) + let item = " (WITH cte AS (SELECT 1) SELECT * FROM cte) AS year_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); } #[test] From f83ac55ebb7cf93fd4f8849f23bf18828aa58d77 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 08:50:50 -0700 Subject: [PATCH 08/15] Match aliases inside parenthesized ORDER BY and support implicit aliases - identifier_appears_in: remove depth=0 restriction so aliases inside COALESCE(year_total, 0) or (year_total) are correctly detected - subquery_alias_from_item: detect implicit aliases without AS keyword, e.g. (SELECT ...) year_total --- yardstick-rs/src/sql/measures.rs | 87 +++++++++++++++++++------------- 1 file changed, 51 insertions(+), 36 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 6943cce..d1a931a 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5545,48 +5545,71 @@ fn item_has_subquery(item: &str) -> bool { false } -/// Check if a single SELECT item contains a `(SELECT ...)` subquery and has an AS alias. +/// Check if a single SELECT item contains a `(SELECT ...)` subquery and has an alias. +/// Supports both explicit (`AS alias`) and implicit (`(SELECT ...) alias`) forms. fn subquery_alias_from_item(item: &str) -> Option { if !item_has_subquery(item) { return None; } - // Find the last top-level whitespace-AS-whitespace (not inside parentheses). - // Handles spaces, tabs, and newlines around AS. + // Find the last top-level alias (not inside parentheses). let bytes = item.as_bytes(); let mut depth: i32 = 0; - let mut last_as_end: Option = None; + let mut last_alias_start: Option = None; let mut i = 0; while i < bytes.len() { match bytes[i] { b'(' => depth += 1, - b')' => depth -= 1, + b')' => { + depth -= 1; + if depth == 0 { + // After closing paren at depth 0, check for trailing alias + let mut j = i + 1; + while j < bytes.len() && bytes[j].is_ascii_whitespace() { + j += 1; + } + if j < bytes.len() { + // Check for explicit "AS" keyword + if j + 2 <= bytes.len() + && bytes[j..j + 2].eq_ignore_ascii_case(b"AS") + && (j + 2 >= bytes.len() || bytes[j + 2].is_ascii_whitespace()) + { + let mut end = j + 2; + while end < bytes.len() && bytes[end].is_ascii_whitespace() { + end += 1; + } + last_alias_start = Some(end); + } else if bytes[j].is_ascii_alphabetic() || bytes[j] == b'_' + || bytes[j] == b'"' || bytes[j] == b'`' + { + // Implicit alias (no AS keyword) + last_alias_start = Some(j); + } + } + } + } c if depth == 0 && c.is_ascii_whitespace() => { - // Skip leading whitespace - let ws_start = i; + // Also check for AS keyword in non-subquery trailing context while i < bytes.len() && bytes[i].is_ascii_whitespace() { i += 1; } - // Check for "AS" followed by whitespace if i + 2 <= bytes.len() && bytes[i..i + 2].eq_ignore_ascii_case(b"AS") && (i + 2 >= bytes.len() || bytes[i + 2].is_ascii_whitespace()) { - let _ = ws_start; let mut end = i + 2; while end < bytes.len() && bytes[end].is_ascii_whitespace() { end += 1; } - last_as_end = Some(end); + last_alias_start = Some(end); } - continue; // already advanced i past whitespace + continue; } _ => {} } i += 1; } - let as_end = last_as_end?; - let alias = item[as_end..].trim(); - // Extract identifier: alphanumeric, underscore, or quoted + let alias_start = last_alias_start?; + let alias = item[alias_start..].trim(); let alias = alias .split(|c: char| !c.is_alphanumeric() && c != '_' && c != '"' && c != '`') .next() @@ -5600,32 +5623,18 @@ fn subquery_alias_from_item(item: &str) -> Option { } } -/// Check if an identifier appears as a top-level whole word in text (case-insensitive), -/// skipping string literals, comments, and content inside parentheses. +/// Check if an identifier appears as a whole word in text (case-insensitive), +/// skipping string literals and comments. Matches at any parenthesis depth +/// so that `COALESCE(year_total, 0)` still triggers wrapping. fn identifier_appears_in(text: &str, ident: &str) -> bool { let stripped = strip_literals_and_comments(text); let stripped_upper = stripped.to_uppercase(); let ident_upper = ident.to_uppercase(); - - // Pre-compute parenthesis depth at each byte position let bytes = stripped.as_bytes(); - let mut depths = vec![0i32; bytes.len()]; - let mut depth: i32 = 0; - for i in 0..bytes.len() { - if bytes[i] == b'(' { - depth += 1; - } - depths[i] = depth; - if bytes[i] == b')' { - depth -= 1; - } - } let mut search_from = 0; while let Some(pos) = stripped_upper[search_from..].find(&ident_upper) { let abs = search_from + pos; - // Only match at top level (outside parentheses) - let at_top_level = depths[abs] == 0; let before_ok = abs == 0 || { let c = bytes[abs - 1]; if c == b'"' || c == b'`' { @@ -5639,7 +5648,7 @@ fn identifier_appears_in(text: &str, ident: &str) -> bool { let c = bytes[after_pos]; !c.is_ascii_alphanumeric() && c != b'_' }; - if at_top_level && before_ok && after_ok { + if before_ok && after_ok { return true; } search_from = abs + 1; @@ -8243,10 +8252,10 @@ GROUP BY s.year"; // But bare quoted aliases like "year_total" should match assert!(identifier_appears_in(r#""year_total""#, "year_total")); assert!(identifier_appears_in(r#"`year_total`"#, "year_total")); - // Should not match inside parentheses (e.g., function arguments) - assert!(!identifier_appears_in("EXTRACT(YEAR FROM o.ts)", "year")); - assert!(!identifier_appears_in("COALESCE(year_total, 0)", "year_total")); - // But top-level references should still match + // Should match inside parentheses (e.g., COALESCE(year_total, 0)) + assert!(identifier_appears_in("COALESCE(year_total, 0)", "year_total")); + assert!(identifier_appears_in("(year_total)", "year_total")); + // Top-level references should still match assert!(identifier_appears_in("year_total + COALESCE(x, 0)", "year_total")); } @@ -8286,6 +8295,12 @@ GROUP BY s.year"; // CTE subquery: (WITH ... SELECT ...) let item = " (WITH cte AS (SELECT 1) SELECT * FROM cte) AS year_total"; assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Implicit alias (no AS keyword) + let item = " (SELECT 1) year_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Quoted implicit alias + let item = r#" (SELECT 1) "year_total""#; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); } #[test] From d430c8a9544711fa59305701feaa3e70734c046a Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 09:05:02 -0700 Subject: [PATCH 09/15] Require token boundaries for SELECT/WITH subquery checks item_has_subquery now verifies the character after SELECT/WITH is not alphanumeric or underscore, preventing identifiers like selected_value or with_tax from being misclassified as subquery keywords. --- yardstick-rs/src/sql/measures.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index d1a931a..a1ddff4 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5533,10 +5533,16 @@ fn item_has_subquery(item: &str) -> bool { while j < bytes.len() && bytes[j].is_ascii_whitespace() { j += 1; } - if j + 6 <= bytes.len() && bytes[j..j + 6].eq_ignore_ascii_case(b"SELECT") { + if j + 6 <= bytes.len() + && bytes[j..j + 6].eq_ignore_ascii_case(b"SELECT") + && (j + 6 >= bytes.len() || !bytes[j + 6].is_ascii_alphanumeric() && bytes[j + 6] != b'_') + { return true; } - if j + 4 <= bytes.len() && bytes[j..j + 4].eq_ignore_ascii_case(b"WITH") { + if j + 4 <= bytes.len() + && bytes[j..j + 4].eq_ignore_ascii_case(b"WITH") + && (j + 4 >= bytes.len() || !bytes[j + 4].is_ascii_alphanumeric() && bytes[j + 4] != b'_') + { return true; } } From 7740e275cb518c0e8ab063b9a79e8ed9e51f9ab2 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 09:36:18 -0700 Subject: [PATCH 10/15] Skip string literals and comments when detecting subqueries item_has_subquery now skips single-quoted strings, line comments, and block comments so that text like '(SELECT ...)' inside a literal is not misclassified as a real subquery. --- yardstick-rs/src/sql/measures.rs | 65 ++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index a1ddff4..830780b 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5528,25 +5528,60 @@ fn item_has_subquery(item: &str) -> bool { let bytes = item.as_bytes(); let mut i = 0; while i < bytes.len() { - if bytes[i] == b'(' { - let mut j = i + 1; - while j < bytes.len() && bytes[j].is_ascii_whitespace() { - j += 1; + match bytes[i] { + b'\'' => { + i += 1; + while i < bytes.len() { + if bytes[i] == b'\'' { + if i + 1 < bytes.len() && bytes[i + 1] == b'\'' { + i += 2; + } else { + i += 1; + break; + } + } else { + i += 1; + } + } } - if j + 6 <= bytes.len() - && bytes[j..j + 6].eq_ignore_ascii_case(b"SELECT") - && (j + 6 >= bytes.len() || !bytes[j + 6].is_ascii_alphanumeric() && bytes[j + 6] != b'_') - { - return true; + b'-' if i + 1 < bytes.len() && bytes[i + 1] == b'-' => { + while i < bytes.len() && bytes[i] != b'\n' { + i += 1; + } } - if j + 4 <= bytes.len() - && bytes[j..j + 4].eq_ignore_ascii_case(b"WITH") - && (j + 4 >= bytes.len() || !bytes[j + 4].is_ascii_alphanumeric() && bytes[j + 4] != b'_') - { - return true; + b'/' if i + 1 < bytes.len() && bytes[i + 1] == b'*' => { + i += 2; + while i + 1 < bytes.len() { + if bytes[i] == b'*' && bytes[i + 1] == b'/' { + i += 2; + break; + } + i += 1; + } } + b'(' => { + let mut j = i + 1; + while j < bytes.len() && bytes[j].is_ascii_whitespace() { + j += 1; + } + if j + 6 <= bytes.len() + && bytes[j..j + 6].eq_ignore_ascii_case(b"SELECT") + && (j + 6 >= bytes.len() + || !bytes[j + 6].is_ascii_alphanumeric() && bytes[j + 6] != b'_') + { + return true; + } + if j + 4 <= bytes.len() + && bytes[j..j + 4].eq_ignore_ascii_case(b"WITH") + && (j + 4 >= bytes.len() + || !bytes[j + 4].is_ascii_alphanumeric() && bytes[j + 4] != b'_') + { + return true; + } + i += 1; + } + _ => i += 1, } - i += 1; } false } From afd6e291593be04afd1144d2284a73486cc12dad Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 16:52:56 -0700 Subject: [PATCH 11/15] Skip literals and comments in subquery_alias_from_item depth tracking Parentheses inside string literals or comments (e.g. (SELECT ...) || ')' AS year_total) no longer corrupt the depth counter, which previously caused the alias to be missed and the ORDER BY wrapper to not apply. --- yardstick-rs/src/sql/measures.rs | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 830780b..f4ad659 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5593,12 +5593,46 @@ fn subquery_alias_from_item(item: &str) -> Option { return None; } // Find the last top-level alias (not inside parentheses). + // Skip string literals and comments so quoted parens don't corrupt depth. let bytes = item.as_bytes(); let mut depth: i32 = 0; let mut last_alias_start: Option = None; let mut i = 0; while i < bytes.len() { match bytes[i] { + b'\'' => { + i += 1; + while i < bytes.len() { + if bytes[i] == b'\'' { + if i + 1 < bytes.len() && bytes[i + 1] == b'\'' { + i += 2; + } else { + i += 1; + break; + } + } else { + i += 1; + } + } + continue; + } + b'-' if i + 1 < bytes.len() && bytes[i + 1] == b'-' => { + while i < bytes.len() && bytes[i] != b'\n' { + i += 1; + } + continue; + } + b'/' if i + 1 < bytes.len() && bytes[i + 1] == b'*' => { + i += 2; + while i + 1 < bytes.len() { + if bytes[i] == b'*' && bytes[i + 1] == b'/' { + i += 2; + break; + } + i += 1; + } + continue; + } b'(' => depth += 1, b')' => { depth -= 1; From ef2686efeaf527bdbed1749925a2a5de5b5ad829 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 18:08:44 -0700 Subject: [PATCH 12/15] Skip comments when splitting SELECT items in alias extraction extract_subquery_aliases_from_select now skips -- line comments and /* block comments */ so commas inside comments are not treated as SELECT item separators. --- yardstick-rs/src/sql/measures.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index f4ad659..3d18321 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5504,6 +5504,23 @@ fn extract_subquery_aliases_from_select(sql: &str) -> Vec { } } } + b'-' if i + 1 < select_text.len() && bytes[i + 1] == b'-' => { + while i < select_text.len() && bytes[i] != b'\n' { + i += 1; + } + continue; + } + b'/' if i + 1 < select_text.len() && bytes[i + 1] == b'*' => { + i += 2; + while i + 1 < select_text.len() { + if bytes[i] == b'*' && bytes[i + 1] == b'/' { + i += 2; + break; + } + i += 1; + } + continue; + } b',' if depth == 0 => { if let Some(alias) = subquery_alias_from_item(&select_text[item_start..i]) { aliases.push(alias); From 0303c34e2356b90a1f0b24d425e067115bbd9768 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 11 Apr 2026 20:39:58 -0700 Subject: [PATCH 13/15] Skip nested subqueries when scanning ORDER BY for alias references identifier_appears_in now strips (SELECT ...) and (WITH ...) blocks before scanning, so aliases appearing only inside nested subqueries in ORDER BY do not falsely trigger the query wrapper. --- yardstick-rs/src/sql/measures.rs | 57 +++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 3d18321..26d5997 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5716,13 +5716,15 @@ fn subquery_alias_from_item(item: &str) -> Option { } /// Check if an identifier appears as a whole word in text (case-insensitive), -/// skipping string literals and comments. Matches at any parenthesis depth -/// so that `COALESCE(year_total, 0)` still triggers wrapping. +/// skipping string literals, comments, and nested subqueries. +/// Matches inside regular function calls (e.g. `COALESCE(year_total, 0)`) +/// but not inside `(SELECT ...)` subquery scopes. fn identifier_appears_in(text: &str, ident: &str) -> bool { let stripped = strip_literals_and_comments(text); - let stripped_upper = stripped.to_uppercase(); + let cleaned = strip_nested_subqueries(&stripped); + let stripped_upper = cleaned.to_uppercase(); let ident_upper = ident.to_uppercase(); - let bytes = stripped.as_bytes(); + let bytes = cleaned.as_bytes(); let mut search_from = 0; while let Some(pos) = stripped_upper[search_from..].find(&ident_upper) { @@ -5807,6 +5809,48 @@ fn strip_literals_and_comments(text: &str) -> String { String::from_utf8(out).unwrap_or_else(|_| text.to_string()) } +/// Replace content inside `(SELECT ...)` and `(WITH ...)` subquery blocks with spaces. +/// Regular function-call parentheses are left intact. +fn strip_nested_subqueries(text: &str) -> String { + let bytes = text.as_bytes(); + let mut out = text.to_string().into_bytes(); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'(' { + let mut j = i + 1; + while j < bytes.len() && bytes[j].is_ascii_whitespace() { + j += 1; + } + let is_subquery = (j + 6 <= bytes.len() + && bytes[j..j + 6].eq_ignore_ascii_case(b"SELECT") + && (j + 6 >= bytes.len() + || !bytes[j + 6].is_ascii_alphanumeric() && bytes[j + 6] != b'_')) + || (j + 4 <= bytes.len() + && bytes[j..j + 4].eq_ignore_ascii_case(b"WITH") + && (j + 4 >= bytes.len() + || !bytes[j + 4].is_ascii_alphanumeric() && bytes[j + 4] != b'_')); + if is_subquery { + // Blank out everything from ( to matching ) + let mut depth = 1; + out[i] = b' '; + i += 1; + while i < bytes.len() && depth > 0 { + match bytes[i] { + b'(' => depth += 1, + b')' => depth -= 1, + _ => {} + } + out[i] = b' '; + i += 1; + } + continue; + } + } + i += 1; + } + String::from_utf8(out).unwrap_or_else(|_| text.to_string()) +} + /// Expand AGGREGATE() with AT modifiers in SQL pub fn expand_aggregate_with_at(sql: &str) -> AggregateExpandResult { let cte_expansion = expand_cte_queries(sql); @@ -8344,11 +8388,14 @@ GROUP BY s.year"; // But bare quoted aliases like "year_total" should match assert!(identifier_appears_in(r#""year_total""#, "year_total")); assert!(identifier_appears_in(r#"`year_total`"#, "year_total")); - // Should match inside parentheses (e.g., COALESCE(year_total, 0)) + // Should match inside regular function call parentheses assert!(identifier_appears_in("COALESCE(year_total, 0)", "year_total")); assert!(identifier_appears_in("(year_total)", "year_total")); // Top-level references should still match assert!(identifier_appears_in("year_total + COALESCE(x, 0)", "year_total")); + // Should NOT match inside nested subqueries + assert!(!identifier_appears_in("(SELECT year_total FROM aux)", "year_total")); + assert!(!identifier_appears_in("(SELECT year_total FROM aux), o.col", "year_total")); } #[test] From 07389a0c29bd62e626b963dd4eb84ce47d8ae820 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 12 Apr 2026 07:33:11 -0700 Subject: [PATCH 14/15] Use uppercased string for all byte indexing in identifier_appears_in All find() offsets and boundary checks now operate on the same uppercased string, eliminating byte-length mismatches when Unicode case expansion changes the string's byte layout. --- yardstick-rs/src/sql/measures.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 26d5997..7bc1476 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5722,12 +5722,14 @@ fn subquery_alias_from_item(item: &str) -> Option { fn identifier_appears_in(text: &str, ident: &str) -> bool { let stripped = strip_literals_and_comments(text); let cleaned = strip_nested_subqueries(&stripped); - let stripped_upper = cleaned.to_uppercase(); + // Perform all searching and boundary checks on the uppercased string + // to avoid byte-offset mismatches between the original and uppercased forms. + let upper = cleaned.to_uppercase(); let ident_upper = ident.to_uppercase(); - let bytes = cleaned.as_bytes(); + let bytes = upper.as_bytes(); let mut search_from = 0; - while let Some(pos) = stripped_upper[search_from..].find(&ident_upper) { + while let Some(pos) = upper[search_from..].find(&ident_upper) { let abs = search_from + pos; let before_ok = abs == 0 || { let c = bytes[abs - 1]; @@ -5738,7 +5740,7 @@ fn identifier_appears_in(text: &str, ident: &str) -> bool { } }; let after_pos = abs + ident_upper.len(); - let after_ok = after_pos >= stripped_upper.len() || { + let after_ok = after_pos >= bytes.len() || { let c = bytes[after_pos]; !c.is_ascii_alphanumeric() && c != b'_' }; @@ -5746,9 +5748,7 @@ fn identifier_appears_in(text: &str, ident: &str) -> bool { return true; } search_from = abs + 1; - while search_from < stripped_upper.len() - && !stripped_upper.is_char_boundary(search_from) - { + while search_from < bytes.len() && !upper.is_char_boundary(search_from) { search_from += 1; } } From fff8cbc0653d7faf5d1439da6330540936403ff6 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 12 Apr 2026 19:16:43 -0700 Subject: [PATCH 15/15] Derive ORDER BY expression start from actual keyword span Instead of assuming ORDER BY is exactly 8 bytes, skip past ORDER, any whitespace, and BY separately to correctly handle multi-whitespace formatting like ORDER\nBY. --- yardstick-rs/src/sql/measures.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 7bc1476..6c5c006 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -6283,13 +6283,25 @@ pub fn expand_aggregate_with_at(sql: &str) -> AggregateExpandResult { if let Some(order_pos) = find_top_level_keyword(&result_sql, "ORDER BY", 0) { let subquery_aliases = extract_subquery_aliases_from_select(&result_sql); if !subquery_aliases.is_empty() { + // Skip past "ORDER", whitespace, and "BY" to find the expression start + let mut expr_start = order_pos + 5; // skip "ORDER" + while expr_start < result_sql.len() + && result_sql.as_bytes()[expr_start].is_ascii_whitespace() + { + expr_start += 1; + } + if expr_start + 2 <= result_sql.len() + && result_sql.as_bytes()[expr_start..expr_start + 2].eq_ignore_ascii_case(b"BY") + { + expr_start += 2; + } let order_end = find_first_top_level_keyword( &result_sql, - order_pos + 8, + expr_start, &["LIMIT", "OFFSET"], ) .unwrap_or(result_sql.len()); - let order_text = &result_sql[order_pos + 8..order_end]; + let order_text = &result_sql[expr_start..order_end]; let needs_wrap = subquery_aliases .iter() .any(|alias| identifier_appears_in(order_text, alias));