Optimize HLSL Lexer Performance (#249)
* Update DirectxLexer.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix the tests

* Fix tokenization method in HLSLLexer to use tokenize() instead of tokens

* Test new merge action

* Fix GitHub Actions workflow to handle branch names correctly during push

* Refactor GitHub Actions workflow to fetch and merge PR branch with main

* Update GitHub Actions workflow to use ACCESS_TOKEN instead of GITHUB_TOKEN for pushing changes

* Update GitHub Actions workflow to use CROSSGL_TOKEN for authentication

* Update GitHub Actions workflow to push changes using CROSSGL_TOKEN for authentication

* Revert changes to workflow, will fix in different PR

---------

Co-authored-by: Nripesh Niketan <[email protected]>
InannaxX07 and NripeshN authored Jan 3, 2025
1 parent cfd399e commit f933145
Showing 4 changed files with 117 additions and 98 deletions.
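
For downstream callers, the visible API change in this commit is that token lists now come from the tokenize() method instead of the eagerly populated tokens attribute (see the test helper updates below). A minimal before/after sketch; the import path is assumed from the file listing and the shader string is illustrative, neither is part of the diff:

from crosstl.backend.DirectX.DirectxLexer import HLSLLexer

code = "float4 main() : SV_Target { return float4(1.0, 0.0, 0.0, 1.0); }"

# before this commit: the constructor tokenized eagerly and exposed the result
# tokens = HLSLLexer(code).tokens

# after this commit: tokenization is requested explicitly
tokens = HLSLLexer(code).tokenize()
print(tokens[:5])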
209 changes: 114 additions & 95 deletions crosstl/backend/DirectX/DirectxLexer.py
@@ -1,75 +1,11 @@
import re
from typing import Iterator, Tuple, List

TOKENS = [
("COMMENT_SINGLE", r"//.*"),
("COMMENT_MULTI", r"/\*[\s\S]*?\*/"),
("INCLUDE", r"\#include\b"),
("STRUCT", r"\bstruct\b"),
("CBUFFER", r"\bcbuffer\b"),
("TEXTURE2D", r"\bTexture2D\b"),
("SAMPLER_STATE", r"\bSamplerState\b"),
("FVECTOR", r"\bfloat[2-4]\b"),
("FLOAT", r"\bfloat\b"),
("DOUBLE", r"\bdouble\b"),
("INT", r"\bint\b"),
("UINT", r"\buint\b"),
("BOOL", r"\bbool\b"),
("MATRIX", r"\bfloat[2-4]x[2-4]\b"),
("VOID", r"\bvoid\b"),
("RETURN", r"\breturn\b"),
("IF", r"\bif\b"),
("ELSE_IF", r"\belse\sif\b"),
("ELSE", r"\belse\b"),
("FOR", r"\bfor\b"),
("WHILE", r"\bwhile\b"),
("DO", r"\bdo\b"),
("REGISTER", r"\bregister\b"),
("IDENTIFIER", r"[a-zA-Z_][a-zA-Z0-9_]*"),
("NUMBER", r"\d+(\.\d+)?"),
("LBRACE", r"\{"),
("RBRACE", r"\}"),
("LPAREN", r"\("),
("RPAREN", r"\)"),
("LBRACKET", r"\["),
("RBRACKET", r"\]"),
("SEMICOLON", r";"),
("COMMA", r","),
("COLON", r":"),
("QUESTION", r"\?"),
("SHIFT_LEFT", r"<<"),
("SHIFT_RIGHT", r">>"),
("LESS_EQUAL", r"<="),
("GREATER_EQUAL", r">="),
("LESS_THAN", r"<"),
("GREATER_THAN", r">"),
("EQUAL", r"=="),
("NOT_EQUAL", r"!="),
("PLUS_EQUALS", r"\+="),
("MINUS_EQUALS", r"-="),
("MULTIPLY_EQUALS", r"\*="),
("DIVIDE_EQUALS", r"/="),
("ASSIGN_XOR", r"\^="),
("ASSIGN_OR", r"\|="),
("ASSIGN_AND", r"\&="),
("BITWISE_XOR", r"\^"),
("LOGICAL_AND", r"&&"),
("LOGICAL_OR", r"\|\|"),
("BITWISE_OR", r"\|"),
("DOT", r"\."),
("MULTIPLY", r"\*"),
("DIVIDE", r"/"),
("PLUS", r"\+"),
("MINUS", r"-"),
("EQUALS", r"="),
("WHITESPACE", r"\s+"),
("STRING", r"\"[^\"]*\""),
("SWITCH", r"\bswitch\b"),
("CASE", r"\bcase\b"),
("DEFAULT", r"\bdefault\b"),
("BREAK", r"\bbreak\b"),
("MOD", r"%"),
]

# using sets for faster lookup
SKIP_TOKENS = {"WHITESPACE", "COMMENT_SINGLE", "COMMENT_MULTI"}

# define keywords dictionary
KEYWORDS = {
"struct": "STRUCT",
"cbuffer": "CBUFFER",
@@ -97,38 +33,121 @@
"break": "BREAK",
}

# use tuple for immutable token types that won't change
TOKENS = tuple(
[
("COMMENT_SINGLE", r"//.*"),
("COMMENT_MULTI", r"/\*[\s\S]*?\*/"),
("INCLUDE", r"\#include\b"),
("STRUCT", r"\bstruct\b"),
("CBUFFER", r"\bcbuffer\b"),
("TEXTURE2D", r"\bTexture2D\b"),
("SAMPLER_STATE", r"\bSamplerState\b"),
("FVECTOR", r"\bfloat[2-4]\b"),
("FLOAT", r"\bfloat\b"),
("DOUBLE", r"\bdouble\b"),
("INT", r"\bint\b"),
("UINT", r"\buint\b"),
("BOOL", r"\bbool\b"),
("MATRIX", r"\bfloat[2-4]x[2-4]\b"),
("VOID", r"\bvoid\b"),
("RETURN", r"\breturn\b"),
("IF", r"\bif\b"),
("ELSE_IF", r"\belse\sif\b"),
("ELSE", r"\belse\b"),
("FOR", r"\bfor\b"),
("WHILE", r"\bwhile\b"),
("DO", r"\bdo\b"),
("REGISTER", r"\bregister\b"),
("IDENTIFIER", r"[a-zA-Z_][a-zA-Z0-9_]*"),
("NUMBER", r"\d+(\.\d+)?"),
("LBRACE", r"\{"),
("RBRACE", r"\}"),
("LPAREN", r"\("),
("RPAREN", r"\)"),
("LBRACKET", r"\["),
("RBRACKET", r"\]"),
("SEMICOLON", r";"),
("COMMA", r","),
("COLON", r":"),
("QUESTION", r"\?"),
("SHIFT_LEFT", r"<<"),
("SHIFT_RIGHT", r">>"),
("LESS_EQUAL", r"<="),
("GREATER_EQUAL", r">="),
("LESS_THAN", r"<"),
("GREATER_THAN", r">"),
("EQUAL", r"=="),
("NOT_EQUAL", r"!="),
("PLUS_EQUALS", r"\+="),
("MINUS_EQUALS", r"-="),
("MULTIPLY_EQUALS", r"\*="),
("DIVIDE_EQUALS", r"/="),
("ASSIGN_XOR", r"\^="),
("ASSIGN_OR", r"\|="),
("ASSIGN_AND", r"\&="),
("BITWISE_XOR", r"\^"),
("LOGICAL_AND", r"&&"),
("LOGICAL_OR", r"\|\|"),
("BITWISE_OR", r"\|"),
("DOT", r"\."),
("MULTIPLY", r"\*"),
("DIVIDE", r"/"),
("PLUS", r"\+"),
("MINUS", r"-"),
("EQUALS", r"="),
("WHITESPACE", r"\s+"),
("STRING", r"\"[^\"]*\""),
("SWITCH", r"\bswitch\b"),
("CASE", r"\bcase\b"),
("DEFAULT", r"\bdefault\b"),
("BREAK", r"\bbreak\b"),
("MOD", r"%"),
]
)


class HLSLLexer:
def __init__(self, code):
def __init__(self, code: str):
self._token_patterns = [(name, re.compile(pattern)) for name, pattern in TOKENS]
self.code = code
self.tokens = []
self.tokenize()
self._length = len(code)

def tokenize(self):
pos = 0
while pos < len(self.code):
match = None
def tokenize(self) -> List[Tuple[str, str]]:
# tokenize the input code and return list of tokens
return list(self.token_generator())

for token_type, pattern in TOKENS:
regex = re.compile(pattern)

match = regex.match(self.code, pos)
if match:
text = match.group(0)
if token_type == "IDENTIFIER" and text in KEYWORDS:
token_type = KEYWORDS[text]
if token_type not in [
"WHITESPACE",
"COMMENT_SINGLE",
"COMMENT_MULTI",
]:
token = (token_type, text)
self.tokens.append(token)
pos = match.end(0)
break
if not match:
def token_generator(self) -> Iterator[Tuple[str, str]]:
# function that yields tokens one at a time
pos = 0
while pos < self._length:
token = self._next_token(pos)
if token is None:
raise SyntaxError(
f"Illegal character '{self.code[pos]}' at position {pos}"
)
new_pos, token_type, text = token

if token_type == "IDENTIFIER" and text in KEYWORDS:
token_type = KEYWORDS[text]

if token_type not in SKIP_TOKENS:
yield (token_type, text)

pos = new_pos

yield ("EOF", "")

def _next_token(self, pos: int) -> Tuple[int, str, str]:
# find the next token starting at the given position
for token_type, pattern in self._token_patterns:
match = pattern.match(self.code, pos)
if match:
return match.end(0), token_type, match.group(0)
return None

self.tokens.append(("EOF", ""))
@classmethod
def from_file(cls, filepath: str, chunk_size: int = 8192) -> "HLSLLexer":
# create a lexer instance from a file (reads the whole file; chunk_size is currently unused)
with open(filepath, "r") as f:
return cls(f.read())
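
The performance change itself is twofold: each token regex is now compiled once in __init__ and reused via self._token_patterns (the old loop recompiled every pattern on every match attempt), and tokens are produced lazily by token_generator(), with tokenize() materializing the list on demand. A small usage sketch of the streaming path; the shader string and import path are illustrative assumptions, not part of the commit:

from crosstl.backend.DirectX.DirectxLexer import HLSLLexer

hlsl = """
cbuffer Params : register(b0) { float4 color; };
float4 main() : SV_Target { return color; }
"""

lexer = HLSLLexer(hlsl)

# stream tokens one at a time without building the full list
for token_type, text in lexer.token_generator():
    if token_type == "EOF":
        break
    print(token_type, text)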
2 changes: 1 addition & 1 deletion tests/test_backend/test_directx/test_codegen.py
@@ -19,7 +19,7 @@ def generate_code(ast_node):
def tokenize_code(code: str) -> List:
"""Helper function to tokenize code."""
lexer = HLSLLexer(code)
return lexer.tokens
return lexer.tokenize()


def parse_code(tokens: List):
2 changes: 1 addition & 1 deletion tests/test_backend/test_directx/test_lexer.py
@@ -6,7 +6,7 @@
def tokenize_code(code: str) -> List:
"""Helper function to tokenize code."""
lexer = HLSLLexer(code)
return lexer.tokens
return lexer.tokenize()


def test_struct_tokenization():
2 changes: 1 addition & 1 deletion tests/test_backend/test_directx/test_parser.py
@@ -19,7 +19,7 @@ def parse_code(tokens: List):
def tokenize_code(code: str) -> List:
"""Helper function to tokenize code."""
lexer = HLSLLexer(code)
return lexer.tokens
return lexer.tokenize()


def test_struct_parsing():
