Replace library functions(isalpha & isspace)#1042
Replace library functions(isalpha & isspace)#1042Sulfoxide1819 wants to merge 3 commits intoleethomason:masterfrom
Conversation
There was a problem hiding this comment.
Pull request overview
This PR replaces standard library functions isalpha and isspace with custom bit mask implementations to achieve significant performance improvements (11-15% on x86, ~20% on RISC-V). The optimization focuses on US-ASCII character processing using efficient bit manipulation operations.
Key changes:
- Introduced new
IsSpacefunction using a 64-bit bitmask to identify whitespace characters (tab, newline, vertical tab, form feed, carriage return, space) - Rewrote
IsNameStartCharto use a 4-element bitmask array covering the full 8-bit character range - Added comprehensive unit tests for
IsNameStartCharcovering edge cases and boundary values
Reviewed changes
Copilot reviewed 2 out of 2 changed files in this pull request and generated 4 comments.
| File | Description |
|---|---|
| tinyxml2.h | Implements bit mask-based character classification functions replacing library calls |
| xmltest.cpp | Adds unit tests for IsNameStartChar validating correct behavior for special characters, alphabetic ranges, and high-order bytes |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| inline static bool IsSpace( unsigned char ch ) { | ||
| static constexpr uint64_t mask = | ||
| 1ULL << 9 | ||
| | 1ULL << 10 | ||
| | 1ULL << 11 | ||
| | 1ULL << 12 | ||
| | 1ULL << 13 | ||
| | 1ULL << 32; | ||
| if ( ch > 32 ) { | ||
| return false; | ||
| } | ||
| return mask >> ch & 1; | ||
| } |
There was a problem hiding this comment.
[nitpick] Missing documentation for new function. Consider adding a comment explaining what this function does, similar to the comment on line 582 for IsNameStartChar. For example: "// Checks if a character is whitespace (tab, newline, vertical tab, form feed, carriage return, or space)"
There was a problem hiding this comment.
This is pretty opaque - if cool - function. Why does this even work? Is there a reference to where the algo comes from?
There was a problem hiding this comment.
This is just looking for one-of the characters, space, h/v tabs, line/form feed and CR.
The character itself cannot be a combination of two, ( because of the if ( ch > 32 ){ ... } check
So, if the return value is true, then is must be one of those characters.
So you end up with 2 checks only ... the >32 then the mask.
| if ( ch > 32 ) { | ||
| return false; | ||
| } | ||
| return mask >> ch & 1; |
There was a problem hiding this comment.
Missing parentheses in bitwise operation. The expression mask >> ch & 1 is evaluated as mask >> (ch & 1) due to operator precedence, not (mask >> ch) & 1 as intended. This will produce incorrect results for most inputs.
Fix: return (mask >> ch) & 1;
| return mask >> ch & 1; | |
| return (mask >> ch) & 1; |
| // ---------- Testing IsNameStartChar ---------- | ||
| { | ||
| XMLUtil test; | ||
| // Tests validate key edge cases for IsNameStartChar without exhaustive coverage | ||
| XMLTest("IsNameStartChar(':')", true, test.IsNameStartChar(':')); | ||
| XMLTest("IsNameStartChar('_')", true, test.IsNameStartChar('_')); | ||
| XMLTest("IsNameStartChar('@')", false, test.IsNameStartChar('@')); | ||
| XMLTest("IsNameStartChar('A')", true, test.IsNameStartChar('A')); | ||
| XMLTest("IsNameStartChar('Z')", true, test.IsNameStartChar('Z')); | ||
| XMLTest("IsNameStartChar('[')", false, test.IsNameStartChar('[')); | ||
| XMLTest("IsNameStartChar('`')", false, test.IsNameStartChar('`')); | ||
| XMLTest("IsNameStartChar('a')", true, test.IsNameStartChar('a')); | ||
| XMLTest("IsNameStartChar('z')", true, test.IsNameStartChar('z')); | ||
| XMLTest("IsNameStartChar('{')", false, test.IsNameStartChar('{')); | ||
| XMLTest("IsNameStartChar(127)", false, test.IsNameStartChar(static_cast<unsigned char>(127))); | ||
| XMLTest("IsNameStartChar(128)", true, test.IsNameStartChar(static_cast<unsigned char>(128))); | ||
| XMLTest("IsNameStartChar(255)", true, test.IsNameStartChar(static_cast<unsigned char>(255))); | ||
| } |
There was a problem hiding this comment.
Inconsistent indentation: this test block uses spaces for indentation, while the rest of the file uses tabs. Please convert the indentation to tabs to match the existing code style.
| // ---------- Testing IsNameStartChar ---------- | |
| { | |
| XMLUtil test; | |
| // Tests validate key edge cases for IsNameStartChar without exhaustive coverage | |
| XMLTest("IsNameStartChar(':')", true, test.IsNameStartChar(':')); | |
| XMLTest("IsNameStartChar('_')", true, test.IsNameStartChar('_')); | |
| XMLTest("IsNameStartChar('@')", false, test.IsNameStartChar('@')); | |
| XMLTest("IsNameStartChar('A')", true, test.IsNameStartChar('A')); | |
| XMLTest("IsNameStartChar('Z')", true, test.IsNameStartChar('Z')); | |
| XMLTest("IsNameStartChar('[')", false, test.IsNameStartChar('[')); | |
| XMLTest("IsNameStartChar('`')", false, test.IsNameStartChar('`')); | |
| XMLTest("IsNameStartChar('a')", true, test.IsNameStartChar('a')); | |
| XMLTest("IsNameStartChar('z')", true, test.IsNameStartChar('z')); | |
| XMLTest("IsNameStartChar('{')", false, test.IsNameStartChar('{')); | |
| XMLTest("IsNameStartChar(127)", false, test.IsNameStartChar(static_cast<unsigned char>(127))); | |
| XMLTest("IsNameStartChar(128)", true, test.IsNameStartChar(static_cast<unsigned char>(128))); | |
| XMLTest("IsNameStartChar(255)", true, test.IsNameStartChar(static_cast<unsigned char>(255))); | |
| } | |
| // ---------- Testing IsNameStartChar ---------- | |
| { | |
| XMLUtil test; | |
| // Tests validate key edge cases for IsNameStartChar without exhaustive coverage | |
| XMLTest("IsNameStartChar(':')", true, test.IsNameStartChar(':')); | |
| XMLTest("IsNameStartChar('_')", true, test.IsNameStartChar('_')); | |
| XMLTest("IsNameStartChar('@')", false, test.IsNameStartChar('@')); | |
| XMLTest("IsNameStartChar('A')", true, test.IsNameStartChar('A')); | |
| XMLTest("IsNameStartChar('Z')", true, test.IsNameStartChar('Z')); | |
| XMLTest("IsNameStartChar('[')", false, test.IsNameStartChar('[')); | |
| XMLTest("IsNameStartChar('`')", false, test.IsNameStartChar('`')); | |
| XMLTest("IsNameStartChar('a')", true, test.IsNameStartChar('a')); | |
| XMLTest("IsNameStartChar('z')", true, test.IsNameStartChar('z')); | |
| XMLTest("IsNameStartChar('{')", false, test.IsNameStartChar('{')); | |
| XMLTest("IsNameStartChar(127)", false, test.IsNameStartChar(static_cast<unsigned char>(127))); | |
| XMLTest("IsNameStartChar(128)", true, test.IsNameStartChar(static_cast<unsigned char>(128))); | |
| XMLTest("IsNameStartChar(255)", true, test.IsNameStartChar(static_cast<unsigned char>(255))); | |
| } |
| return IsSpace( static_cast<unsigned char>(p) ); | ||
| } | ||
|
|
||
| // The method checks a char for matching ':', '_', alphabetic symbols or char >= 128 by bit mask |
There was a problem hiding this comment.
[nitpick] Trailing whitespace at end of comment line. Remove the trailing space after "bit mask".
| // The method checks a char for matching ':', '_', alphabetic symbols or char >= 128 by bit mask | |
| // The method checks a char for matching ':', '_', alphabetic symbols or char >= 128 by bit mask |
Add new method IsAlpha and rewrite IsNameStartChar to avoid using library functions. Since the old functions were applied only to us-ASCII characters, bit mask can be used in rewritten methods. It gives a good acceleration: 11-15% x86, about 20% RISCV.