You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

234 lines
6.2 KiB
C++

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#pragma once
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>
/*
 * Whitespace stripping for std::string values that hold GB2312/GBK encoded
 * text.
 *
 * Double-byte characters are always handled as a unit, so a byte that belongs
 * to a Chinese character is never mistaken for a whitespace byte, and the
 * full-width (ideographic) space is recognised as whitespace.
 */
class GBKStripper
{
private:
    // One rectangular block of valid (lead byte, trail byte) combinations.
    struct GbkByteRange
    {
        std::uint8_t high_start = 0; // first valid lead byte (inclusive)
        std::uint8_t high_end = 0;   // last valid lead byte (inclusive)
        std::uint8_t low_start = 0;  // first valid trail byte (inclusive)
        std::uint8_t low_end = 0;    // last valid trail byte (inclusive)
        std::unordered_set<std::uint8_t> excluded_bytes; // trail bytes carved out of the block

        // Returns true when (high, low) lies inside the block and `low` is
        // not one of the excluded trail bytes.
        bool contains(std::uint8_t high, std::uint8_t low) const
        {
            return high >= high_start && high <= high_end
                && low >= low_start && low <= low_end
                && excluded_bytes.count(low) == 0;
        }
    };

    // Table of valid GBK double-byte ranges.
    // Reference: https://zhuanlan.zhihu.com/p/453675608
    //
    // The table is a function-local static, built on first use. This header
    // therefore contains no namespace-scope variable definition and can be
    // included from several translation units without duplicate-symbol link
    // errors, and the table cannot be read before it has been initialised.
    static const std::vector<GbkByteRange>& gbk_byte_ranges()
    {
        static const std::vector<GbkByteRange> ranges = {
            {0xA1, 0xA9, 0xA1, 0xFE, {}},     // 0xA1-0xA9 + 0xA1-0xFE
            {0xB0, 0xF7, 0xA1, 0xFE, {}},     // 0xB0-0xF7 + 0xA1-0xFE
            {0x81, 0xA0, 0x40, 0xFE, {0x7F}}, // 0x81-0xA0 + 0x40-0xFE (0x7F excluded)
            {0xA8, 0xA9, 0x40, 0xA0, {0x7F}}, // 0xA8-0xA9 + 0x40-0xA0 (0x7F excluded)
            {0xAA, 0xFE, 0x40, 0xA0, {0x7F}}, // 0xAA-0xFE + 0x40-0xA0 (0x7F excluded)
            // Optional: user-defined areas (enable if they must be supported)
            // {0xA1, 0xA7, 0x40, 0xA0, {0x7F}}, // user-defined area
            // {0xAA, 0xAF, 0xA1, 0xFE, {}},     // user-defined area
            // {0xF8, 0xFE, 0xA1, 0xFE, {}},     // user-defined area
        };
        return ranges;
    }

    // GBK lead bytes occupy 0x81-0xFE; 0x80 and 0xFF never start a character.
    static bool is_gbk_lead_byte(std::uint8_t byte)
    {
        return byte >= 0x81 && byte <= 0xFE;
    }

    // True for the GBK full-width (ideographic) space, U+3000, which is
    // encoded as the byte pair 0xA1 0xA1.
    static bool is_gbk_full_width_space(std::uint8_t high, std::uint8_t low)
    {
        return high == 0xA1 && low == 0xA1;
    }

    // True when the two bytes starting at `pos` form a double-byte character
    // listed in the range table. Returns false when fewer than two bytes
    // remain at `pos`.
    static bool is_valid_gbk_char(const std::string& str, std::size_t pos)
    {
        const std::size_t len = str.length();
        if (pos >= len || len - pos < 2) // written this way so `pos + 1` cannot wrap
        {
            return false;
        }
        const std::uint8_t high = static_cast<std::uint8_t>(str[pos]);
        const std::uint8_t low = static_cast<std::uint8_t>(str[pos + 1]);
        for (const auto& range : gbk_byte_ranges())
        {
            if (range.contains(high, low))
            {
                return true;
            }
        }
        return false;
    }

    // True when the character starting at `pos` is whitespace: either an
    // ASCII byte accepted by std::isspace, or the GBK full-width space.
    // `pos` must be a character boundary; an out-of-range `pos` yields false.
    static bool is_whitespace_char(const std::string& str, std::size_t pos)
    {
        if (pos >= str.length())
        {
            return false;
        }
        const std::uint8_t first = static_cast<std::uint8_t>(str[pos]);
        if (first < 0x80)
        {
            // Single-byte ASCII; the value is < 0x80, so it is always a valid
            // argument for std::isspace.
            return std::isspace(first) != 0;
        }
        if (!is_valid_gbk_char(str, pos))
        {
            return false;
        }
        return is_gbk_full_width_space(first, static_cast<std::uint8_t>(str[pos + 1]));
    }

    // Returns the index of the character following the one that starts at
    // `pos`: two bytes ahead for a valid double-byte character, otherwise one
    // byte ahead. Returns `pos` unchanged when it is already at or past the end.
    static std::size_t next_char_pos(const std::string& str, std::size_t pos)
    {
        if (pos >= str.length())
        {
            return pos;
        }
        const std::uint8_t first = static_cast<std::uint8_t>(str[pos]);
        if (is_gbk_lead_byte(first) && is_valid_gbk_char(str, pos))
        {
            return pos + 2;
        }
        return pos + 1; // ASCII, or a stray/invalid byte treated as one unit
    }

public:
    /**
     * @brief Removes leading and trailing whitespace (including the GBK
     *        full-width space) from a string.
     * @param str Input string, expected to be GBK/GB2312 encoded.
     * @return A copy of `str` without leading or trailing whitespace.
     */
    static std::string strip(const std::string& str)
    {
        return rstrip(lstrip(str));
    }

    /**
     * @brief Removes leading whitespace (including the GBK full-width space).
     * @param str Input string, expected to be GBK/GB2312 encoded.
     * @return A copy of `str` without leading whitespace.
     */
    static std::string lstrip(const std::string& str)
    {
        const std::size_t len = str.length();
        std::size_t start = 0;
        while (start < len && is_whitespace_char(str, start))
        {
            start = next_char_pos(str, start);
        }
        return str.substr(start);
    }

    /**
     * @brief Removes trailing whitespace (including the GBK full-width space).
     * @param str Input string, expected to be GBK/GB2312 encoded.
     * @return A copy of `str` without trailing whitespace.
     */
    static std::string rstrip(const std::string& str)
    {
        // GBK trail bytes overlap both the ASCII range and the lead-byte
        // range, so character boundaries cannot be recovered reliably by
        // walking backwards from the end. Walk forward once instead and
        // remember where the last non-whitespace character ends.
        const std::size_t len = str.length();
        std::size_t kept_end = 0;
        std::size_t pos = 0;
        while (pos < len)
        {
            const std::size_t next = next_char_pos(str, pos);
            if (!is_whitespace_char(str, pos))
            {
                kept_end = next;
            }
            pos = next;
        }
        return str.substr(0, kept_end);
    }
};