diff --git a/test/sql/measures.test b/test/sql/measures.test index d659851..99ebdf4 100644 --- a/test/sql/measures.test +++ b/test/sql/measures.test @@ -1772,6 +1772,49 @@ WHERE year = 2023; ---- 150.0 +# ORDER BY with named aggregate that expands to a subquery (GH #28) +query IIRR +SEMANTIC SELECT + year, + region, + AGGREGATE(revenue) AS revenue, + AGGREGATE(revenue) AT (ALL region) AS year_total +FROM sales_v +ORDER BY revenue/year_total, year, region; +---- +2022 EU 50.0 150.0 +2023 EU 75.0 225.0 +2022 US 100.0 150.0 +2023 US 150.0 225.0 + +# ORDER BY with simple alias of subquery expression (GH #28) +query IIR +SEMANTIC SELECT + year, + region, + AGGREGATE(revenue) AT (ALL region) AS year_total +FROM sales_v +ORDER BY year_total, year, region; +---- +2022 EU 150.0 +2022 US 150.0 +2023 EU 225.0 +2023 US 225.0 + +# ORDER BY DESC with subquery alias (GH #28) +query IIR +SEMANTIC SELECT + year, + region, + AGGREGATE(revenue) AT (ALL region) AS year_total +FROM sales_v +ORDER BY year_total DESC, year, region; +---- +2023 EU 225.0 +2023 US 225.0 +2022 EU 150.0 +2022 US 150.0 + # ============================================================================= # Test: Expression dimensions (issue #29) # ============================================================================= diff --git a/yardstick-rs/src/sql/measures.rs b/yardstick-rs/src/sql/measures.rs index 763a9a5..83c0f2f 100644 --- a/yardstick-rs/src/sql/measures.rs +++ b/yardstick-rs/src/sql/measures.rs @@ -5474,6 +5474,388 @@ fn validate_set_expression_requirements( None } +/// Extract alias names of SELECT items whose expressions contain subqueries. +fn extract_subquery_aliases_from_select(sql: &str) -> Vec { + let from_pos = find_top_level_keyword(sql, "FROM", 0).unwrap_or(sql.len()); + let select_start = find_top_level_keyword(sql, "SELECT", 0).map(|p| p + 6).unwrap_or(0); + let select_text = &sql[select_start..from_pos]; + + let mut aliases = Vec::new(); + let bytes = select_text.as_bytes(); + let mut depth: i32 = 0; + let mut item_start = 0; + let mut i = 0; + + while i < select_text.len() { + match bytes[i] { + b'(' => depth += 1, + b')' => depth -= 1, + b'\'' => { + i += 1; + while i < select_text.len() { + if bytes[i] == b'\'' { + if i + 1 < select_text.len() && bytes[i + 1] == b'\'' { + i += 2; + } else { + break; + } + } else { + i += 1; + } + } + } + b'-' if i + 1 < select_text.len() && bytes[i + 1] == b'-' => { + while i < select_text.len() && bytes[i] != b'\n' { + i += 1; + } + continue; + } + b'/' if i + 1 < select_text.len() && bytes[i + 1] == b'*' => { + i += 2; + while i + 1 < select_text.len() { + if bytes[i] == b'*' && bytes[i + 1] == b'/' { + i += 2; + break; + } + i += 1; + } + continue; + } + b',' if depth == 0 => { + if let Some(alias) = subquery_alias_from_item(&select_text[item_start..i]) { + aliases.push(alias); + } + item_start = i + 1; + } + _ => {} + } + i += 1; + } + // Last item + if let Some(alias) = subquery_alias_from_item(&select_text[item_start..]) { + aliases.push(alias); + } + + aliases +} + +/// Check if text contains a parenthesized subquery: `(SELECT ...)` or `(WITH ... SELECT ...)`, +/// tolerating whitespace between `(` and the keyword. +fn item_has_subquery(item: &str) -> bool { + let bytes = item.as_bytes(); + let mut i = 0; + while i < bytes.len() { + match bytes[i] { + b'\'' => { + i += 1; + while i < bytes.len() { + if bytes[i] == b'\'' { + if i + 1 < bytes.len() && bytes[i + 1] == b'\'' { + i += 2; + } else { + i += 1; + break; + } + } else { + i += 1; + } + } + } + b'-' if i + 1 < bytes.len() && bytes[i + 1] == b'-' => { + while i < bytes.len() && bytes[i] != b'\n' { + i += 1; + } + } + b'/' if i + 1 < bytes.len() && bytes[i + 1] == b'*' => { + i += 2; + while i + 1 < bytes.len() { + if bytes[i] == b'*' && bytes[i + 1] == b'/' { + i += 2; + break; + } + i += 1; + } + } + b'(' => { + let mut j = i + 1; + while j < bytes.len() && bytes[j].is_ascii_whitespace() { + j += 1; + } + if j + 6 <= bytes.len() + && bytes[j..j + 6].eq_ignore_ascii_case(b"SELECT") + && (j + 6 >= bytes.len() + || !bytes[j + 6].is_ascii_alphanumeric() && bytes[j + 6] != b'_') + { + return true; + } + if j + 4 <= bytes.len() + && bytes[j..j + 4].eq_ignore_ascii_case(b"WITH") + && (j + 4 >= bytes.len() + || !bytes[j + 4].is_ascii_alphanumeric() && bytes[j + 4] != b'_') + { + return true; + } + i += 1; + } + _ => i += 1, + } + } + false +} + +/// Check if a single SELECT item contains a `(SELECT ...)` subquery and has an alias. +/// Supports both explicit (`AS alias`) and implicit (`(SELECT ...) alias`) forms. +fn subquery_alias_from_item(item: &str) -> Option { + if !item_has_subquery(item) { + return None; + } + // Find the last top-level alias (not inside parentheses). + // Skip string literals and comments so quoted parens don't corrupt depth. + let bytes = item.as_bytes(); + let mut depth: i32 = 0; + let mut last_alias_start: Option = None; + let mut i = 0; + while i < bytes.len() { + match bytes[i] { + b'\'' => { + i += 1; + while i < bytes.len() { + if bytes[i] == b'\'' { + if i + 1 < bytes.len() && bytes[i + 1] == b'\'' { + i += 2; + } else { + i += 1; + break; + } + } else { + i += 1; + } + } + continue; + } + b'-' if i + 1 < bytes.len() && bytes[i + 1] == b'-' => { + while i < bytes.len() && bytes[i] != b'\n' { + i += 1; + } + continue; + } + b'/' if i + 1 < bytes.len() && bytes[i + 1] == b'*' => { + i += 2; + while i + 1 < bytes.len() { + if bytes[i] == b'*' && bytes[i + 1] == b'/' { + i += 2; + break; + } + i += 1; + } + continue; + } + b'(' => depth += 1, + b')' => { + depth -= 1; + if depth == 0 { + // After closing paren at depth 0, check for trailing alias + let mut j = i + 1; + while j < bytes.len() && bytes[j].is_ascii_whitespace() { + j += 1; + } + if j < bytes.len() { + // Check for explicit "AS" keyword + if j + 2 <= bytes.len() + && bytes[j..j + 2].eq_ignore_ascii_case(b"AS") + && (j + 2 >= bytes.len() || bytes[j + 2].is_ascii_whitespace()) + { + let mut end = j + 2; + while end < bytes.len() && bytes[end].is_ascii_whitespace() { + end += 1; + } + last_alias_start = Some(end); + } else if bytes[j].is_ascii_alphabetic() || bytes[j] == b'_' + || bytes[j] == b'"' || bytes[j] == b'`' + { + // Implicit alias (no AS keyword) + last_alias_start = Some(j); + } + } + } + } + c if depth == 0 && c.is_ascii_whitespace() => { + // Also check for AS keyword in non-subquery trailing context + while i < bytes.len() && bytes[i].is_ascii_whitespace() { + i += 1; + } + if i + 2 <= bytes.len() + && bytes[i..i + 2].eq_ignore_ascii_case(b"AS") + && (i + 2 >= bytes.len() || bytes[i + 2].is_ascii_whitespace()) + { + let mut end = i + 2; + while end < bytes.len() && bytes[end].is_ascii_whitespace() { + end += 1; + } + last_alias_start = Some(end); + } + continue; + } + _ => {} + } + i += 1; + } + let alias_start = last_alias_start?; + let alias = item[alias_start..].trim(); + let alias = alias + .split(|c: char| !c.is_alphanumeric() && c != '_' && c != '"' && c != '`') + .next() + .unwrap_or(alias) + .trim_matches('"') + .trim_matches('`'); + if alias.is_empty() { + None + } else { + Some(alias.to_string()) + } +} + +/// Check if an identifier appears as a whole word in text (case-insensitive), +/// skipping string literals, comments, and nested subqueries. +/// Matches inside regular function calls (e.g. `COALESCE(year_total, 0)`) +/// but not inside `(SELECT ...)` subquery scopes. +fn identifier_appears_in(text: &str, ident: &str) -> bool { + let stripped = strip_literals_and_comments(text); + let cleaned = strip_nested_subqueries(&stripped); + // Perform all searching and boundary checks on the uppercased string + // to avoid byte-offset mismatches between the original and uppercased forms. + let upper = cleaned.to_uppercase(); + let ident_upper = ident.to_uppercase(); + let bytes = upper.as_bytes(); + + let mut search_from = 0; + while let Some(pos) = upper[search_from..].find(&ident_upper) { + let abs = search_from + pos; + let before_ok = abs == 0 || { + let c = bytes[abs - 1]; + if c == b'"' || c == b'`' { + // Scan backwards past whitespace to check for a dot (e.g. o . "alias") + let mut k = abs.saturating_sub(2); + while k > 0 && bytes[k].is_ascii_whitespace() { + k -= 1; + } + abs < 2 || bytes[k] != b'.' + } else { + !c.is_ascii_alphanumeric() && c != b'_' && c != b'.' + } + }; + let after_pos = abs + ident_upper.len(); + let after_ok = after_pos >= bytes.len() || { + let c = bytes[after_pos]; + !c.is_ascii_alphanumeric() && c != b'_' + }; + if before_ok && after_ok { + return true; + } + search_from = abs + 1; + while search_from < bytes.len() && !upper.is_char_boundary(search_from) { + search_from += 1; + } + } + false +} + +/// Replace string literals and comments with spaces to avoid false matches. +fn strip_literals_and_comments(text: &str) -> String { + let bytes = text.as_bytes(); + let mut out = text.to_string().into_bytes(); + let mut i = 0; + while i < bytes.len() { + match bytes[i] { + b'\'' => { + out[i] = b' '; + i += 1; + while i < bytes.len() { + if bytes[i] == b'\'' { + if i + 1 < bytes.len() && bytes[i + 1] == b'\'' { + out[i] = b' '; + out[i + 1] = b' '; + i += 2; + } else { + out[i] = b' '; + i += 1; + break; + } + } else { + out[i] = b' '; + i += 1; + } + } + } + b'-' if i + 1 < bytes.len() && bytes[i + 1] == b'-' => { + while i < bytes.len() && bytes[i] != b'\n' { + out[i] = b' '; + i += 1; + } + } + b'/' if i + 1 < bytes.len() && bytes[i + 1] == b'*' => { + out[i] = b' '; + out[i + 1] = b' '; + i += 2; + while i + 1 < bytes.len() { + if bytes[i] == b'*' && bytes[i + 1] == b'/' { + out[i] = b' '; + out[i + 1] = b' '; + i += 2; + break; + } + out[i] = b' '; + i += 1; + } + } + _ => i += 1, + } + } + String::from_utf8(out).unwrap_or_else(|_| text.to_string()) +} + +/// Replace content inside `(SELECT ...)` and `(WITH ...)` subquery blocks with spaces. +/// Regular function-call parentheses are left intact. +fn strip_nested_subqueries(text: &str) -> String { + let bytes = text.as_bytes(); + let mut out = text.to_string().into_bytes(); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'(' { + let mut j = i + 1; + while j < bytes.len() && bytes[j].is_ascii_whitespace() { + j += 1; + } + let is_subquery = (j + 6 <= bytes.len() + && bytes[j..j + 6].eq_ignore_ascii_case(b"SELECT") + && (j + 6 >= bytes.len() + || !bytes[j + 6].is_ascii_alphanumeric() && bytes[j + 6] != b'_')) + || (j + 4 <= bytes.len() + && bytes[j..j + 4].eq_ignore_ascii_case(b"WITH") + && (j + 4 >= bytes.len() + || !bytes[j + 4].is_ascii_alphanumeric() && bytes[j + 4] != b'_')); + if is_subquery { + // Blank out everything from ( to matching ) + let mut depth = 1; + out[i] = b' '; + i += 1; + while i < bytes.len() && depth > 0 { + match bytes[i] { + b'(' => depth += 1, + b')' => depth -= 1, + _ => {} + } + out[i] = b' '; + i += 1; + } + continue; + } + } + i += 1; + } + String::from_utf8(out).unwrap_or_else(|_| text.to_string()) +} + /// Expand AGGREGATE() with AT modifiers in SQL pub fn expand_aggregate_with_at(sql: &str) -> AggregateExpandResult { let cte_expansion = expand_cte_queries(sql); @@ -5901,6 +6283,41 @@ pub fn expand_aggregate_with_at(sql: &str) -> AggregateExpandResult { ); } + // If ORDER BY references an alias whose SELECT expression is a subquery, + // wrap the query to avoid DuckDB's "Alias has subquery" limitation. + if let Some(order_pos) = find_top_level_keyword(&result_sql, "ORDER BY", 0) { + let subquery_aliases = extract_subquery_aliases_from_select(&result_sql); + if !subquery_aliases.is_empty() { + // Skip past "ORDER", whitespace, and "BY" to find the expression start + let mut expr_start = order_pos + 5; // skip "ORDER" + while expr_start < result_sql.len() + && result_sql.as_bytes()[expr_start].is_ascii_whitespace() + { + expr_start += 1; + } + if expr_start + 2 <= result_sql.len() + && result_sql.as_bytes()[expr_start..expr_start + 2].eq_ignore_ascii_case(b"BY") + { + expr_start += 2; + } + let order_end = find_first_top_level_keyword( + &result_sql, + expr_start, + &["LIMIT", "OFFSET"], + ) + .unwrap_or(result_sql.len()); + let order_text = &result_sql[expr_start..order_end]; + let needs_wrap = subquery_aliases + .iter() + .any(|alias| identifier_appears_in(order_text, alias)); + if needs_wrap { + let trailing = result_sql[order_pos..].trim_end_matches(';').trim_end(); + let inner = result_sql[..order_pos].trim_end().trim_end_matches(';').trim_end(); + result_sql = format!("SELECT * FROM ({inner}) _q {trailing}"); + } + } + } + AggregateExpandResult { had_aggregate, expanded_sql: result_sql, @@ -7952,6 +8369,99 @@ GROUP BY s.year"; )); } + #[test] + fn test_extract_subquery_aliases_from_select() { + let sql = "SELECT year, region, SUM(revenue) AS revenue, (SELECT SUM(revenue) FROM t _inner WHERE _inner.year IS NOT DISTINCT FROM _outer.year) AS year_total FROM sales_v _outer GROUP BY year, region"; + let aliases = extract_subquery_aliases_from_select(sql); + assert_eq!(aliases, vec!["year_total"]); + } + + #[test] + fn test_extract_subquery_aliases_no_subquery() { + let sql = "SELECT year, SUM(revenue) AS revenue FROM sales_v GROUP BY year"; + let aliases = extract_subquery_aliases_from_select(sql); + assert!(aliases.is_empty()); + } + + #[test] + fn test_identifier_appears_in() { + assert!(identifier_appears_in("revenue/year_total", "year_total")); + assert!(identifier_appears_in("revenue/year_total", "revenue")); + assert!(!identifier_appears_in("revenue/year_total", "year")); + assert!(identifier_appears_in("year_total DESC", "year_total")); + assert!(identifier_appears_in(" year_total", "year_total")); + assert!(!identifier_appears_in("o.prodName", "year_total")); + // Should not match inside string literals + assert!(!identifier_appears_in("'year_total'", "year_total")); + assert!(!identifier_appears_in("'contains year_total inside'", "year_total")); + // Should not match inside comments + assert!(!identifier_appears_in("-- year_total\nrevenue", "year_total")); + assert!(!identifier_appears_in("/* year_total */ revenue", "year_total")); + // Should still match real identifiers alongside literals + assert!(identifier_appears_in("'literal' || year_total", "year_total")); + // Should not match qualified quoted names like o."year_total" + assert!(!identifier_appears_in(r#"o."year_total""#, "year_total")); + assert!(!identifier_appears_in(r#"o.`year_total`"#, "year_total")); + // Should not match spaced qualified quoted names like o . "year_total" + assert!(!identifier_appears_in(r#"o . "year_total""#, "year_total")); + assert!(!identifier_appears_in(r#"o . `year_total`"#, "year_total")); + // But bare quoted aliases like "year_total" should match + assert!(identifier_appears_in(r#""year_total""#, "year_total")); + assert!(identifier_appears_in(r#"`year_total`"#, "year_total")); + // Should match inside regular function call parentheses + assert!(identifier_appears_in("COALESCE(year_total, 0)", "year_total")); + assert!(identifier_appears_in("(year_total)", "year_total")); + // Top-level references should still match + assert!(identifier_appears_in("year_total + COALESCE(x, 0)", "year_total")); + // Should NOT match inside nested subqueries + assert!(!identifier_appears_in("(SELECT year_total FROM aux)", "year_total")); + assert!(!identifier_appears_in("(SELECT year_total FROM aux), o.col", "year_total")); + } + + #[test] + fn test_extract_subquery_aliases_with_string_in_item() { + // Comma inside a string literal should not split the item + let sql = "SELECT 'a,b' || (SELECT 1) AS x FROM t"; + let aliases = extract_subquery_aliases_from_select(sql); + assert_eq!(aliases, vec!["x"]); + } + + #[test] + fn test_subquery_alias_from_item_multibyte() { + // Should not panic on multibyte UTF-8 characters + let item = " (SELECT 1) || '\u{00e9}' AS year_total"; + let alias = subquery_alias_from_item(item); + assert_eq!(alias, Some("year_total".to_string())); + } + + #[test] + fn test_subquery_alias_from_item_whitespace_variants() { + // Tab between AS and alias + let item = " (SELECT 1) AS\tyear_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Newline between AS and alias + let item = " (SELECT 1) AS\nyear_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Multiple spaces + let item = " (SELECT 1) AS year_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Space after opening paren: ( SELECT ...) + let item = " ( SELECT 1) AS year_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Newline after opening paren + let item = " (\nSELECT 1) AS year_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // CTE subquery: (WITH ... SELECT ...) + let item = " (WITH cte AS (SELECT 1) SELECT * FROM cte) AS year_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Implicit alias (no AS keyword) + let item = " (SELECT 1) year_total"; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + // Quoted implicit alias + let item = r#" (SELECT 1) "year_total""#; + assert_eq!(subquery_alias_from_item(item), Some("year_total".to_string())); + } + #[test] fn test_is_expression_dim() { // Simple column references