From c00291090c874079cdafba8cb7899e5f68da5b31 Mon Sep 17 00:00:00 2001 From: Michael Victor Zink Date: Fri, 16 Jan 2026 12:10:28 -0800 Subject: [PATCH] MySQL: Add support for DEFAULT CHARACTER SET in CREATE DATABASE Parse MySQL-style [DEFAULT] CHARACTER SET and [DEFAULT] COLLATE options in CREATE DATABASE statements. This adds two new fields to CreateDatabase: default_charset and default_collation. Supports the following syntax variants: - DEFAULT CHARACTER SET [=] charset_name - CHARACTER SET [=] charset_name - DEFAULT CHARSET [=] charset_name - CHARSET [=] charset_name - DEFAULT COLLATE [=] collation_name - COLLATE [=] collation_name --- src/ast/helpers/stmt_create_database.rs | 28 ++++++++ src/ast/mod.rs | 14 ++++ src/parser/mod.rs | 30 ++++++++ tests/sqlparser_mysql.rs | 93 +++++++++++++++++++++++++ 4 files changed, 165 insertions(+) diff --git a/src/ast/helpers/stmt_create_database.rs b/src/ast/helpers/stmt_create_database.rs index c718dbce1..e524228de 100644 --- a/src/ast/helpers/stmt_create_database.rs +++ b/src/ast/helpers/stmt_create_database.rs @@ -85,6 +85,14 @@ pub struct CreateDatabaseBuilder { pub storage_serialization_policy: Option, /// Optional comment attached to the database. pub comment: Option, + /// Optional default character set (MySQL). + /// + /// + pub default_charset: Option, + /// Optional default collation (MySQL). + /// + /// + pub default_collation: Option, /// Optional catalog sync configuration. pub catalog_sync: Option, /// Optional catalog sync namespace mode. @@ -120,6 +128,8 @@ impl CreateDatabaseBuilder { default_ddl_collation: None, storage_serialization_policy: None, comment: None, + default_charset: None, + default_collation: None, catalog_sync: None, catalog_sync_namespace_mode: None, catalog_sync_namespace_flatten_delimiter: None, @@ -218,6 +228,18 @@ impl CreateDatabaseBuilder { self } + /// Set the default character set for the database. 
+ pub fn default_charset(mut self, default_charset: Option) -> Self { + self.default_charset = default_charset; + self + } + + /// Set the default collation for the database. + pub fn default_collation(mut self, default_collation: Option) -> Self { + self.default_collation = default_collation; + self + } + /// Set the catalog sync for the database. pub fn catalog_sync(mut self, catalog_sync: Option) -> Self { self.catalog_sync = catalog_sync; @@ -272,6 +294,8 @@ impl CreateDatabaseBuilder { default_ddl_collation: self.default_ddl_collation, storage_serialization_policy: self.storage_serialization_policy, comment: self.comment, + default_charset: self.default_charset, + default_collation: self.default_collation, catalog_sync: self.catalog_sync, catalog_sync_namespace_mode: self.catalog_sync_namespace_mode, catalog_sync_namespace_flatten_delimiter: self.catalog_sync_namespace_flatten_delimiter, @@ -302,6 +326,8 @@ impl TryFrom for CreateDatabaseBuilder { default_ddl_collation, storage_serialization_policy, comment, + default_charset, + default_collation, catalog_sync, catalog_sync_namespace_mode, catalog_sync_namespace_flatten_delimiter, @@ -323,6 +349,8 @@ impl TryFrom for CreateDatabaseBuilder { default_ddl_collation, storage_serialization_policy, comment, + default_charset, + default_collation, catalog_sync, catalog_sync_namespace_mode, catalog_sync_namespace_flatten_delimiter, diff --git a/src/ast/mod.rs b/src/ast/mod.rs index ce5a67e12..b0d878fc4 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -4226,6 +4226,10 @@ pub enum Statement { storage_serialization_policy: Option, /// Optional comment. comment: Option, + /// Optional default character set (MySQL). + default_charset: Option, + /// Optional default collation (MySQL). + default_collation: Option, /// Optional catalog sync identifier. catalog_sync: Option, /// Catalog sync namespace mode. 
@@ -5080,6 +5084,8 @@ impl fmt::Display for Statement { default_ddl_collation, storage_serialization_policy, comment, + default_charset, + default_collation, catalog_sync, catalog_sync_namespace_mode, catalog_sync_namespace_flatten_delimiter, @@ -5139,6 +5145,14 @@ impl fmt::Display for Statement { write!(f, " COMMENT = '{comment}'")?; } + if let Some(charset) = default_charset { + write!(f, " DEFAULT CHARACTER SET {charset}")?; + } + + if let Some(collation) = default_collation { + write!(f, " DEFAULT COLLATE {collation}")?; + } + if let Some(sync) = catalog_sync { write!(f, " CATALOG_SYNC = '{sync}'")?; } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 5fa224f97..e126729d4 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -5330,6 +5330,34 @@ impl<'a> Parser<'a> { None }; + // Parse MySQL-style [DEFAULT] CHARACTER SET and [DEFAULT] COLLATE options + // + // Note: The docs only mention `CHARACTER SET`, but `CHARSET` is also supported. + // Furthermore, MySQL will only accept one character set, raising an error if there is more + // than one, but will accept multiple collations and use the last one. 
+ // + // + let mut default_charset = None; + let mut default_collation = None; + loop { + let has_default = self.parse_keyword(Keyword::DEFAULT); + if default_charset.is_none() && (self.parse_keywords(&[Keyword::CHARACTER, Keyword::SET]) + || self.parse_keyword(Keyword::CHARSET)) + { + let _ = self.consume_token(&Token::Eq); + default_charset = Some(self.parse_identifier()?.value); + } else if self.parse_keyword(Keyword::COLLATE) { + let _ = self.consume_token(&Token::Eq); + default_collation = Some(self.parse_identifier()?.value); + } else if has_default { + // DEFAULT not followed by CHARACTER SET, CHARSET, or COLLATE, or followed by a repeated character set (either spelling): call prev_token() (presumably un-consuming DEFAULT) and stop + self.prev_token(); + break; + } else { + break; + } + } + Ok(Statement::CreateDatabase { db_name, if_not_exists: ine, @@ -5346,6 +5374,8 @@ impl<'a> Parser<'a> { default_ddl_collation: None, storage_serialization_policy: None, comment: None, + default_charset, + default_collation, catalog_sync: None, catalog_sync_namespace_mode: None, catalog_sync_namespace_flatten_delimiter: None, diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs index 2c942798c..56d5c295b 100644 --- a/tests/sqlparser_mysql.rs +++ b/tests/sqlparser_mysql.rs @@ -4432,3 +4432,96 @@ fn test_optimizer_hints() { DELETE /*+ foobar */ FROM table_name", ); } + +#[test] +fn parse_create_database_with_charset() { + // Test DEFAULT CHARACTER SET without = sign (normalized form) + mysql_and_generic().verified_stmt("CREATE DATABASE mydb DEFAULT CHARACTER SET utf8mb4"); + + // Test DEFAULT CHARACTER SET with = sign + mysql_and_generic().one_statement_parses_to( + "CREATE DATABASE mydb DEFAULT CHARACTER SET = utf8mb4", + "CREATE DATABASE mydb DEFAULT CHARACTER SET utf8mb4", + ); + + // Test CHARACTER SET without DEFAULT + mysql_and_generic().one_statement_parses_to( + "CREATE DATABASE mydb CHARACTER SET utf8mb4", + "CREATE DATABASE mydb DEFAULT CHARACTER SET utf8mb4", + ); + + // Test CHARSET shorthand + mysql_and_generic().one_statement_parses_to( + "CREATE DATABASE mydb 
CHARSET utf8mb4", + "CREATE DATABASE mydb DEFAULT CHARACTER SET utf8mb4", + ); + + // Test DEFAULT CHARSET shorthand + mysql_and_generic().one_statement_parses_to( + "CREATE DATABASE mydb DEFAULT CHARSET utf8mb4", + "CREATE DATABASE mydb DEFAULT CHARACTER SET utf8mb4", + ); + + // Test DEFAULT COLLATE + mysql_and_generic().verified_stmt("CREATE DATABASE mydb DEFAULT COLLATE utf8mb4_unicode_ci"); + + // Test COLLATE without DEFAULT + mysql_and_generic().one_statement_parses_to( + "CREATE DATABASE mydb COLLATE utf8mb4_unicode_ci", + "CREATE DATABASE mydb DEFAULT COLLATE utf8mb4_unicode_ci", + ); + + // Test both CHARACTER SET and COLLATE together + mysql_and_generic().verified_stmt( + "CREATE DATABASE mydb DEFAULT CHARACTER SET utf8mb4 DEFAULT COLLATE utf8mb4_unicode_ci", + ); + + // Test IF NOT EXISTS with CHARACTER SET + mysql_and_generic() + .verified_stmt("CREATE DATABASE IF NOT EXISTS mydb DEFAULT CHARACTER SET utf16"); + + // Test the exact syntax from the issue + mysql_and_generic().one_statement_parses_to( + "CREATE DATABASE IF NOT EXISTS noria DEFAULT CHARACTER SET = utf16", + "CREATE DATABASE IF NOT EXISTS noria DEFAULT CHARACTER SET utf16", + ); +} + +#[test] +fn parse_create_database_with_charset_errors() { + // Missing charset name after CHARACTER SET + assert!(mysql_and_generic() + .parse_sql_statements("CREATE DATABASE mydb DEFAULT CHARACTER SET") + .is_err()); + + // Missing charset name after CHARSET + assert!(mysql_and_generic() + .parse_sql_statements("CREATE DATABASE mydb CHARSET") + .is_err()); + + // Missing collation name after COLLATE + assert!(mysql_and_generic() + .parse_sql_statements("CREATE DATABASE mydb DEFAULT COLLATE") + .is_err()); + + // Equals sign but no value + assert!(mysql_and_generic() + .parse_sql_statements("CREATE DATABASE mydb CHARACTER SET =") + .is_err()); +} + +#[test] +fn parse_create_database_with_charset_option_ordering() { + // MySQL allows COLLATE before CHARACTER SET - output is normalized to CHARACTER SET first + 
// (matches MySQL's own SHOW CREATE DATABASE output order) + mysql_and_generic().one_statement_parses_to( + "CREATE DATABASE mydb DEFAULT COLLATE utf8mb4_unicode_ci DEFAULT CHARACTER SET utf8mb4", + "CREATE DATABASE mydb DEFAULT CHARACTER SET utf8mb4 DEFAULT COLLATE utf8mb4_unicode_ci", + ); + + // COLLATE first without DEFAULT keywords + mysql_and_generic().one_statement_parses_to( + "CREATE DATABASE mydb COLLATE utf8mb4_unicode_ci CHARACTER SET utf8mb4", + "CREATE DATABASE mydb DEFAULT CHARACTER SET utf8mb4 DEFAULT COLLATE utf8mb4_unicode_ci", + ); +}