2022-06-27 – tolower() in bulk at speed 一文的阅读笔记。承接上文 如何快速寻找第一个非 ASCII 字符位置,如果我们知道一个字符串是纯 ASCII,那么我们可以在字符串处理上做一些更多的优化
这里先放一张 ASCII 表,方便查阅
对于大写字符其 ASCII 值小于对应的小写字符,并且差值为 0x20
toLower
的实现#
核心思路还是批处理和位标记。我们来看一下实现的细节,函数如下
uint64_t tolower8(uint64_t octets) {
uint64_t all_bytes = 0x0101010101010101;
uint64_t heptets = octets & (0x7F * all_bytes);
uint64_t is_gt_Z = heptets + (0x7F - 'Z') * all_bytes;
uint64_t is_ge_A = heptets + (0x80 - 'A') * all_bytes;
uint64_t is_ascii = ~octets & (0x80 * all_bytes);
uint64_t is_upper = is_ascii & (is_ge_A ^ is_gt_Z);
return (octets | is_upper >> 2);
}
uint64_t all_bytes = 0x0101010101010101;
这个是一个辅助掩码,通过乘法运算可以将单字节扩展到 8 字节。举个例子,比如 (0x7F - 'Z') * all_bytes
(0x7F - 'Z') = 0x25
0x25 * 0x0101010101010101 = 0x2525252525252525
编译阶段涉及到 all_bytes
的乘法运算会直接折叠成常量
// uint64_t heptets = octets & (0x7F * all_bytes);
uint64_t heptets = octets & 0x7F7F7F7F7F7F7F7F
heptets
对于每个字节和 0x7F
进行与运算,将最高有效位置为 0
。方便在之后的逻辑中最高有效位用来作为标记位使用。
我们来关注一下 is_gt_Z
和 is_ge_A
两个变量的用途
uint64_t is_gt_Z = heptets + (0x7F - 'Z') * all_bytes;
uint64_t is_ge_A = heptets + (0x80 - 'A') * all_bytes;
这里是利用了位标记的技巧。ASCII 的最大范围是 0x7F
,即 0111_1111
,最高位有效位是 0
。如果我们向任一个字节加上 (0x7F - 'Z')
,使最高有效位变为 1
,那么说明大于 Z
。如下
A = 0x41
Z = 0x5A
[ = 0x5B
(0x7F - 'Z') = 0x25
0x41 + 0x25 = 0x66 = 0110_0110
0x5A + 0x25 = 0x7F = 0111_1111
0x5B + 0x25 = 0x80 = 1000_0000
所以我们现在通过着两个变量可以确定一个范围
is_gt_Z
: 最高有效位为1
的字节为大于Z
的字符is_ge_A
: 最高有效位为1
的字节为大于A
的字符
通过位反操作,然后和 (0x80 * all_bytes)
进行与运算,这样可以通过最高有效位筛选出原始字符串中的 ASCII 部分。
uint64_t is_ascii = ~octets & (0x80 * all_bytes);
uint64_t is_upper = is_ascii & (is_ge_A ^ is_gt_Z);
对于任何满足 is_gt_Z
的字符也会满足 is_ge_A
;对于任何不满足 is_ge_A
的字符,也不会满足 is_gt_Z
。因此这里可以通过异或运算来筛选出处于大写区间的字符,即 (is_ge_A ^ is_gt_Z)
。在这个区间里面的字符,最高有效位均为 1
。因此 is_ascii
和 (is_ge_A ^ is_gt_Z)
进行与运算,结果 is_upper
中的所有最高有效位为 1
的字节均为大写字符
最后一步,我们将大写字符转换成小写。在 ASCII 中,大写字母和其所对应小写字母的差值为 0x20
即 0010_0000
。为了获得转换值,我们需要将 0x80
标志位移动到 0x20
位置。最后的或运算用于实现加法
uint64_t to_lower = (is_upper >> 2) & (0x20 * all_bytes);
return (octets | to_lower);
以 aA
为例,每一步的计算结果如下
十六进制值 | 二进制值 | |
原始值 | 0x6141 | 01100001 01000001 |
heptets | 0x6141 | 01100001 01000001 |
is_gt_Z | 0x8666 | 10000110 01100110 |
is_ge_A | 0xa080 | 10100000 10000000 |
(is_ge_A ^ is_gt_Z) | 0x26e6 | 00100110 11100110 |
is_ascii | 0x8080 | 10000000 10000000 |
is_upper | 0x0080 | 00000000 10000000 |
to_lower | 0x0020 | 00000000 00100000 |
toUpper
的实现#
根据上面的思路,可以依葫芦画瓢实现一个 toUpper
,完整代码如下
#include <ctype.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MAX_STRING_LENGTH 10000
#define TEST_CASES 100000
#define VECTOR_MASK 0x0101010101010101ull
#define ASCII_MASK 0x8080808080808080ull
static inline uint64_t toupper8(uint64_t octets)
{
uint64_t heptets = octets & (0x7F * VECTOR_MASK);
uint64_t is_gt_z = heptets + (0x7f - 'z') * VECTOR_MASK;
uint64_t is_ge_a = heptets + (0x80 - 'a') * VECTOR_MASK;
uint64_t is_ascii = ~octets & (0x80 * VECTOR_MASK);
uint64_t is_lower = is_ascii & (is_ge_a ^ is_gt_z);
return octets & ~(is_lower >> 2);
}
size_t ya_toupper(const char* input, size_t length, char* output)
{
const char* p = input;
char* out_p = output;
const char* end = input + length;
while (p <= end - 8) {
uint64_t octets = *(const uint64_t*)p;
uint64_t result = toupper8(octets);
*(uint64_t*)out_p = result;
p += 8;
out_p += 8;
}
uint64_t u = 0;
size_t remaining = end - p;
switch (remaining) {
default:
u |= (uint64_t)(unsigned char)p[7] << 56ull;
case 7:
u |= (uint64_t)(unsigned char)p[6] << 48ull;
case 6:
u |= (uint64_t)(unsigned char)p[5] << 40ull;
case 5:
u |= (uint64_t)(unsigned char)p[4] << 32ull;
case 4:
u |= (uint64_t)(unsigned char)p[3] << 24ull;
case 3:
u |= (uint64_t)(unsigned char)p[2] << 16ull;
case 2:
u |= (uint64_t)(unsigned char)p[1] << 8ull;
case 1:
u |= (uint64_t)(unsigned char)p[0];
break;
case 0:
break;
}
if (remaining > 0) {
uint64_t result = toupper8(u);
for (size_t i = 0; i < remaining; ++i) {
out_p[i] = (char)(result & 0xFF);
result >>= 8;
}
out_p += remaining;
}
return (size_t)(out_p - output);
}
int main()
{
srand((unsigned int)time(NULL));
char input[MAX_STRING_LENGTH + 1];
char expected[MAX_STRING_LENGTH + 1];
char actual[MAX_STRING_LENGTH + 1];
for (int i = 0; i < TEST_CASES; ++i) {
size_t length = rand() % MAX_STRING_LENGTH + 1;
for (size_t j = 0; j < length; ++j) {
char rand_value = (char)(rand() % 256);
input[j] = (rand_value);
}
input[length] = '\0';
for (size_t j = 0; j < length; ++j) {
expected[j] = toupper((unsigned char)input[j]);
}
expected[length] = '\0';
size_t actual_length = ya_toupper(input, length, actual);
actual[actual_length] = '\0';
if (strcmp(expected, actual) != 0) {
printf("Test case %d failed\n", i + 1);
printf("Length : %zd\n", length);
printf("Input : ");
for (size_t j = 0; j < length; ++j) {
printf("%02X ", (unsigned char)input[j]);
}
printf("\n");
printf("Expected: ");
for (size_t j = 0; j < length; ++j) {
printf("%02X ", (unsigned char)expected[j]);
}
printf("\n");
printf("Actual : ");
for (size_t j = 0; j < length; ++j) {
printf("%02X ", (unsigned char)actual[j]);
}
printf("\n");
return 1;
}
}
printf("All test cases passed!\n");
return 0;
}