Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/dialect/generic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ impl Dialect for GenericDialect {
true
}

/// The generic dialect opts in to hint comments: a multiline comment whose
/// body starts with `!` (e.g. `/*!50110 KEY_BLOCK_SIZE = 1024*/`) is
/// re-tokenized by the tokenizer instead of being kept as a comment token.
fn supports_multiline_comment_hints(&self) -> bool {
true
}

fn supports_user_host_grantee(&self) -> bool {
true
}
Expand Down
6 changes: 6 additions & 0 deletions src/dialect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,12 @@ pub trait Dialect: Debug + Any {
false
}

/// Returns true if the dialect treats a multiline comment whose body starts
/// with `!` as a hint comment containing SQL to be processed,
/// e.g. `/*!50110 KEY_BLOCK_SIZE = 1024*/`.
///
/// When this returns true, the tokenizer re-tokenizes the comment body
/// (without the leading `!` and any version digits that follow it) and emits
/// those tokens in place of the comment. The default is `false`, in which
/// case such comments are kept as ordinary multiline comments.
fn supports_multiline_comment_hints(&self) -> bool {
false
}

/// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
/// as an alias assignment operator, rather than a boolean expression.
/// For example: the following statements are equivalent for such a dialect:
Expand Down
5 changes: 5 additions & 0 deletions src/dialect/mysql.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ impl Dialect for MySqlDialect {
true
}

/// MySQL enables hint comments of the form `/*! ... */` and
/// `/*!NNNNN ... */`, whose contents are tokenized rather than discarded.
///
/// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
fn supports_multiline_comment_hints(&self) -> bool {
true
}

fn parse_infix(
&self,
parser: &mut crate::parser::Parser,
Expand Down
106 changes: 101 additions & 5 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -945,10 +945,66 @@ impl<'a> Tokenizer<'a> {
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
let span = location.span_to(state.location());

buf.push(TokenWithSpan { token, span });
// Check if this is a multiline comment hint that should be expanded
match &token {
Token::Whitespace(Whitespace::MultiLineComment(comment))
if self.dialect.supports_multiline_comment_hints()
&& comment.starts_with('!') =>
{
// Re-tokenize the hints and add them to the buffer
self.tokenize_comment_hints(comment, span, buf)?;
}
_ => {
buf.push(TokenWithSpan { token, span });
}
}

location = state.location();
}
Ok(())
}

/// Re-tokenizes the body of a `/*! ... */` hint comment and appends the
/// resulting tokens to `buf`.
///
/// For example, `/*!50110 KEY_BLOCK_SIZE = 1024*/` contributes the tokens for
/// `KEY_BLOCK_SIZE = 1024`.
///
/// * `comment` - the comment body, without the `/*` and `*/` delimiters.
/// * `span` - the span of the whole comment token in the original input.
/// * `buf` - the output buffer; its last token is handed to the tokenizer as
///   the previous token while the hint is being tokenized.
///
/// The leading `!`, any version digits directly after it, and surrounding
/// whitespace are not emitted as tokens. If nothing else remains, `buf` is
/// left untouched.
///
/// # Errors
///
/// Propagates any `TokenizerError` raised while tokenizing the hint body.
fn tokenize_comment_hints(
    &self,
    comment: &str,
    span: Span,
    buf: &mut Vec<TokenWithSpan>,
) -> Result<(), TokenizerError> {
    // Strip the leading '!' and any version digits (e.g., "50110").
    let hint_content = comment
        .strip_prefix('!')
        .unwrap_or(comment)
        .trim_start_matches(|c: char| c.is_ascii_digit())
        .trim();

    // If there's no content after stripping, nothing to tokenize.
    if hint_content.is_empty() {
        return Ok(());
    }

    // Create a new tokenizer for the hint content.
    let mut inner = Tokenizer::new(self.dialect, hint_content).with_unescape(self.unescape);

    // `hint_content` is non-empty, so it is exactly the tail of the
    // right-trimmed comment body. Everything before it ('!', version digits,
    // leading whitespace) is skipped text that still occupies source positions.
    let body = comment.trim_end();
    let skipped_chars = body[..body.len() - hint_content.len()].chars().count();

    // Start the position tracker just past the opening `/*`.
    // NOTE(review): assumes `span.start` points at the `/` of `/*` — confirm
    // against the caller that records the comment token's span.
    let mut state = State {
        peekable: body.chars().peekable(),
        line: span.start.line,
        col: span.start.column + 2,
    };

    // Consume the skipped prefix through the state itself so that line/column
    // bookkeeping (including any newline inside the prefix) stays in sync
    // with the characters that were dropped.
    for _ in 0..skipped_chars {
        state.next();
    }

    // Tokenize the hint content and add tokens to the buffer, giving each
    // token a span that reflects where it really sits in the input.
    let mut location = state.location();
    while let Some(token) = inner.next_token(&mut state, buf.last().map(|t| &t.token))? {
        let token_span = location.span_to(state.location());
        buf.push(TokenWithSpan {
            token,
            span: token_span,
        });
        location = state.location();
    }

    Ok(())
}

Expand Down Expand Up @@ -980,7 +1036,7 @@ impl<'a> Tokenizer<'a> {

/// Get the next token or return None
fn next_token(
&self,
&mut self,
chars: &mut State,
prev_token: Option<&Token>,
) -> Result<Option<Token>, TokenizerError> {
Expand Down Expand Up @@ -2227,13 +2283,12 @@ impl<'a> Tokenizer<'a> {
}

fn tokenize_multiline_comment(
&self,
&mut self,
chars: &mut State,
) -> Result<Option<Token>, TokenizerError> {
let mut s = String::new();
let mut nested = 1;
let supports_nested_comments = self.dialect.supports_nested_comments();

loop {
match chars.next() {
Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
Expand Down Expand Up @@ -4218,6 +4273,47 @@ mod tests {
Token::Whitespace(Whitespace::Space),
Token::make_word("y", None),
],
)
);
}

#[test]
fn tokenize_multiline_comment_with_comment_hint() {
let sql = String::from("0/*! word */1");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at their docs, I'm wondering if/how we support these examples?

SELECT /*! STRAIGHT_JOIN */ col1 FROM table1,table2
/*!50110 KEY_BLOCK_SIZE=1024 */
SELECT /*! BKA(t1) */ FROM T

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iffyio - Parsing the C-style comment keeps sqlparser from discarding its contents as if it were a normal comment. Support for each hint will have to be added on a case-by-case basis. For example, see #2033: MySQL adds a C-style comment if you run SHOW CREATE TABLE:

mysql> SHOW CREATE TABLE b;
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Table | Create Table                                                                                                                                                    |
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
| b     | CREATE TABLE `b` (
  `ID` int DEFAULT NULL,
  `b` char(1) DEFAULT NULL /*!80023 INVISIBLE */
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci |
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
1 row in set (0,008 sec)

Without the current patch, the invisible keyword will be discarded.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, so to clarify, I'm rather wondering about the parser behavior for hints that aren't single words, e.g. /*!50110 KEY_BLOCK_SIZE=1024 */ - can we demonstrate the behavior with test cases for such scenarios?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iffyio thanks for flagging this. I have fixed the issue, and now we properly return the individual tokens inside a C-style hint comment.


let dialect = MySqlDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Number("0".to_string(), false),
Token::Word(Word {
value: "word".to_string(),
quote_style: None,
keyword: Keyword::NoKeyword,
}),
Token::Number("1".to_string(), false),
];
compare(expected, tokens);
}

#[test]
fn tokenize_multiline_comment_with_comment_hint_and_version() {
    // A versioned hint comment must be replaced by the tokens of its body;
    // the `!`, the version number and the padding inside the comment
    // delimiters produce no tokens of their own.
    let dialect = MySqlDialect {};
    let sql = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
    let actual = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

    let space = || Token::Whitespace(Whitespace::Space);
    let number = |digits: &str| Token::Number(digits.to_string(), false);

    let expected = vec![
        number("0"),
        space(),
        Token::Word(Word {
            value: "KEY_BLOCK_SIZE".to_string(),
            quote_style: None,
            keyword: Keyword::KEY_BLOCK_SIZE,
        }),
        space(),
        Token::Eq,
        space(),
        number("1024"),
        space(),
        number("1"),
    ];
    compare(expected, actual);
}
}