From 1727e7f8ab6de12cc39e64952d8a208e8fb33d31 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Fri, 5 Dec 2025 00:26:04 +0530 Subject: [PATCH] GenericDialect: support colon operator for JsonAccess - Port JsonAccess colon operator from Snowflake to Generic dialect - This will be used in variant data type support in Datafusion - see discussion in https://github.com/datafusion-contrib/datafusion-variant/issues/2 --- src/dialect/mod.rs | 7 ++ src/dialect/mssql.rs | 9 +++ src/dialect/postgresql.rs | 2 + tests/sqlparser_common.rs | 123 +++++++++++++++++++++++++++++++++++ tests/sqlparser_snowflake.rs | 115 +------------------------------- 5 files changed, 143 insertions(+), 113 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 83c6da482..bca5865f3 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -759,6 +759,13 @@ pub trait Dialect: Debug + Any { Token::DoubleColon | Token::ExclamationMark | Token::LBracket | Token::CaretAt => { Ok(p!(DoubleColon)) } + Token::Colon => match parser.peek_nth_token(1).token { + // When colon is followed by a string or a number, it's usually in MAP syntax. + Token::SingleQuotedString(_) | Token::Number(_, _) => Ok(self.prec_unknown()), + // In other cases, it's used in semi-structured data traversal like in variant or JSON + // string columns. See `JsonAccess`. 
+ _ => Ok(p!(Pipe)), + }, Token::Arrow | Token::LongArrow | Token::HashArrow diff --git a/src/dialect/mssql.rs b/src/dialect/mssql.rs index e1902b389..0beaa3434 100644 --- a/src/dialect/mssql.rs +++ b/src/dialect/mssql.rs @@ -148,6 +148,15 @@ impl Dialect for MsSqlDialect { None } } + + fn get_next_precedence(&self, parser: &Parser) -> Option<Result<u8, ParserError>> { + let token = parser.peek_token(); + match token.token { + // lowest prec to prevent it from turning into a binary op + Token::Colon => Some(Ok(self.prec_unknown())), + _ => None, + } + } } impl MsSqlDialect { diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index e861cc515..991233fb8 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -136,6 +136,8 @@ impl Dialect for PostgreSqlDialect { | Token::ShiftRight | Token::ShiftLeft | Token::CustomBinaryOperator(_) => Some(Ok(PG_OTHER_PREC)), + // lowest prec to prevent it from turning into a binary op + Token::Colon => Some(Ok(self.prec_unknown())), _ => None, } } diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index ccad67e39..eef333b63 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -17905,3 +17905,126 @@ fn test_parse_set_session_authorization() { })) ); } + +// https://docs.snowflake.com/en/user-guide/querying-semistructured +#[test] +fn parse_semi_structured_data_traversal() { + let dialects = TestedDialects::new(vec![ + Box::new(GenericDialect {}), + Box::new(SnowflakeDialect {}), + ]); + + // most basic case + let sql = "SELECT a:b FROM t"; + let select = dialects.verified_only_select(sql); + assert_eq!( + SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![JsonPathElem::Dot { + key: "b".to_owned(), + quoted: false + }] + }, + }), + select.projection[0] + ); + + // identifier can be quoted + let sql = r#"SELECT a:"my long object key name" FROM t"#; + let select = dialects.verified_only_select(sql); + assert_eq!( 
+ SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![JsonPathElem::Dot { + key: "my long object key name".to_owned(), + quoted: true + }] + }, + }), + select.projection[0] + ); + + dialects.verified_stmt("SELECT a:b::INT FROM t"); + + // unquoted keywords are permitted in the object key + let sql = "SELECT a:select, a:from FROM t"; + let select = dialects.verified_only_select(sql); + assert_eq!( + vec![ + SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![JsonPathElem::Dot { + key: "select".to_owned(), + quoted: false + }] + }, + }), + SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![JsonPathElem::Dot { + key: "from".to_owned(), + quoted: false + }] + }, + }) + ], + select.projection + ); + + // multiple levels can be traversed + // https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation + let sql = r#"SELECT a:foo."bar".baz"#; + let select = dialects.verified_only_select(sql); + assert_eq!( + vec![SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![ + JsonPathElem::Dot { + key: "foo".to_owned(), + quoted: false, + }, + JsonPathElem::Dot { + key: "bar".to_owned(), + quoted: true, + }, + JsonPathElem::Dot { + key: "baz".to_owned(), + quoted: false, + } + ] + }, + })], + select.projection + ); + + // dot and bracket notation can be mixed (starting with : case) + // https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation + let sql = r#"SELECT a:foo[0].bar"#; + let select = dialects.verified_only_select(sql); + assert_eq!( + vec![SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![ + JsonPathElem::Dot { + key: "foo".to_owned(), + quoted: 
false, + }, + JsonPathElem::Bracket { + key: Expr::value(number("0")), + }, + JsonPathElem::Dot { + key: "bar".to_owned(), + quoted: false, + } + ] + }, + })], + select.projection + ); +} diff --git a/tests/sqlparser_snowflake.rs b/tests/sqlparser_snowflake.rs index 22a632666..1bbd27f36 100644 --- a/tests/sqlparser_snowflake.rs +++ b/tests/sqlparser_snowflake.rs @@ -1265,37 +1265,8 @@ fn parse_lateral_flatten() { // https://docs.snowflake.com/en/user-guide/querying-semistructured #[test] fn parse_semi_structured_data_traversal() { - // most basic case - let sql = "SELECT a:b FROM t"; - let select = snowflake().verified_only_select(sql); - assert_eq!( - SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![JsonPathElem::Dot { - key: "b".to_owned(), - quoted: false - }] - }, - }), - select.projection[0] - ); - - // identifier can be quoted - let sql = r#"SELECT a:"my long object key name" FROM t"#; - let select = snowflake().verified_only_select(sql); - assert_eq!( - SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![JsonPathElem::Dot { - key: "my long object key name".to_owned(), - quoted: true - }] - }, - }), - select.projection[0] - ); + // see `tests/sqlparser_common.rs` -> `parse_semi_structured_data_traversal` for more test + // cases. This test only has Snowflake-specific syntax like array access. 
// expressions are allowed in bracket notation let sql = r#"SELECT a[2 + 2] FROM t"#; @@ -1316,88 +1287,6 @@ fn parse_semi_structured_data_traversal() { select.projection[0] ); - snowflake().verified_stmt("SELECT a:b::INT FROM t"); - - // unquoted keywords are permitted in the object key - let sql = "SELECT a:select, a:from FROM t"; - let select = snowflake().verified_only_select(sql); - assert_eq!( - vec![ - SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![JsonPathElem::Dot { - key: "select".to_owned(), - quoted: false - }] - }, - }), - SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![JsonPathElem::Dot { - key: "from".to_owned(), - quoted: false - }] - }, - }) - ], - select.projection - ); - - // multiple levels can be traversed - // https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation - let sql = r#"SELECT a:foo."bar".baz"#; - let select = snowflake().verified_only_select(sql); - assert_eq!( - vec![SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![ - JsonPathElem::Dot { - key: "foo".to_owned(), - quoted: false, - }, - JsonPathElem::Dot { - key: "bar".to_owned(), - quoted: true, - }, - JsonPathElem::Dot { - key: "baz".to_owned(), - quoted: false, - } - ] - }, - })], - select.projection - ); - - // dot and bracket notation can be mixed (starting with : case) - // https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation - let sql = r#"SELECT a:foo[0].bar"#; - let select = snowflake().verified_only_select(sql); - assert_eq!( - vec![SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![ - JsonPathElem::Dot { - key: "foo".to_owned(), - quoted: false, - }, - JsonPathElem::Bracket { - key: Expr::value(number("0")), - 
}, - JsonPathElem::Dot { - key: "bar".to_owned(), - quoted: false, - } - ] - }, - })], - select.projection - ); - // dot and bracket notation can be mixed (starting with bracket case) // https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation let sql = r#"SELECT a[0].foo.bar"#;