You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

234 lines
6.2 KiB
C

1 month ago
#pragma once
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>
/**
 * Whitespace stripping for GBK-encoded std::string values.
 *
 * GBK (a superset of GB2312) mixes single-byte ASCII with double-byte
 * characters, so trimming has to step over whole characters rather than
 * individual bytes. "Whitespace" here means any ASCII byte accepted by
 * std::isspace plus the GBK full-width (ideographic) space 0xA1A1.
 */
class GBKStripper
{
private:
    /// One rectangular region of the GBK double-byte code space.
    struct GbkByteRange
    {
        uint8_t high_start = 0;  ///< first accepted lead byte (inclusive)
        uint8_t high_end = 0;    ///< last accepted lead byte (inclusive)
        uint8_t low_start = 0;   ///< first accepted trail byte (inclusive)
        uint8_t low_end = 0;     ///< last accepted trail byte (inclusive)
        std::unordered_set<uint8_t> excluded_bytes;  ///< trail bytes rejected inside the range

        /// Returns true when (high, low) lies inside this region and low is not excluded.
        bool contains(uint8_t high, uint8_t low) const
        {
            if (high < high_start || high > high_end)
            {
                return false;
            }
            if (low < low_start || low > low_end)
            {
                return false;
            }
            return excluded_bytes.count(low) == 0;
        }
    };

    /**
     * @brief Table of valid GBK double-byte regions.
     *
     * Kept as a function-local static so that this header does not need a
     * namespace-scope definition, which would be duplicated in every
     * translation unit that includes it.
     *
     * Range table reference: https://zhuanlan.zhihu.com/p/453675608
     */
    static const std::vector<GbkByteRange>& gbk_byte_ranges()
    {
        static const std::vector<GbkByteRange> ranges = {
            {0xA1, 0xA9, 0xA1, 0xFE, {}},      // 0xA1-0xA9 + 0xA1-0xFE
            {0xB0, 0xF7, 0xA1, 0xFE, {}},      // 0xB0-0xF7 + 0xA1-0xFE
            {0x81, 0xA0, 0x40, 0xFE, {0x7F}},  // 0x81-0xA0 + 0x40-0xFE (excluding 0x7F)
            {0xA8, 0xA9, 0x40, 0xA0, {0x7F}},  // 0xA8-0xA9 + 0x40-0xA0 (excluding 0x7F)
            {0xAA, 0xFE, 0x40, 0xA0, {0x7F}},  // 0xAA-0xFE + 0x40-0xA0 (excluding 0x7F)
            // Optional: user-defined areas (enable if they must be supported)
            // {0xA1, 0xA7, 0x40, 0xA0, {0x7F}},  // user-defined area
            // {0xAA, 0xAF, 0xA1, 0xFE, {}},      // user-defined area
            // {0xF8, 0xFE, 0xA1, 0xFE, {}},      // user-defined area
        };
        return ranges;
    }

    /// Returns true when byte can start a GBK double-byte character (0x81-0xFE).
    static bool is_gbk_lead_byte(uint8_t byte)
    {
        return byte >= 0x81 && byte <= 0xFE;
    }

    /// Returns true for the GBK full-width (ideographic) space, encoded as 0xA1 0xA1.
    static bool is_gbk_full_width_space(uint8_t high, uint8_t low)
    {
        return high == 0xA1 && low == 0xA1;
    }

    /// Returns true when the two bytes starting at pos form a valid GBK double-byte character.
    static bool is_valid_gbk_char(const std::string& str, size_t pos)
    {
        // A double-byte character needs both bytes to be present.
        if (pos + 1 >= str.length())
        {
            return false;
        }
        const uint8_t high = static_cast<uint8_t>(str[pos]);
        const uint8_t low = static_cast<uint8_t>(str[pos + 1]);
        for (const auto& range : gbk_byte_ranges())
        {
            if (range.contains(high, low))
            {
                return true;
            }
        }
        return false;
    }

    /**
     * @brief Tests whether the character that starts at pos is whitespace.
     *
     * pos must be a character boundary. Bytes >= 0x80 that do not start a
     * valid GBK character are never treated as whitespace.
     */
    static bool is_whitespace_char(const std::string& str, size_t pos)
    {
        if (pos >= str.length())
        {
            return false;
        }
        const uint8_t byte = static_cast<uint8_t>(str[pos]);
        // ASCII whitespace
        if (byte < 0x80)
        {
            return std::isspace(byte) != 0;
        }
        // GBK double-byte character: only the full-width space counts
        if (!is_valid_gbk_char(str, pos))
        {
            return false;
        }
        return is_gbk_full_width_space(byte, static_cast<uint8_t>(str[pos + 1]));
    }

    /**
     * @brief Returns the index of the character following the one at pos.
     *
     * Advances two bytes over a valid GBK character and one byte otherwise
     * (ASCII or a stray high byte). Returns pos unchanged when pos is
     * already at or past the end of the string.
     */
    static size_t next_char_pos(const std::string& str, size_t pos)
    {
        if (pos >= str.length())
        {
            return pos;
        }
        const uint8_t current_byte = static_cast<uint8_t>(str[pos]);
        if (is_gbk_lead_byte(current_byte) && is_valid_gbk_char(str, pos))
        {
            return pos + 2;
        }
        return pos + 1;
    }

public:
    /**
     * @brief Removes leading and trailing whitespace from a GBK string.
     * @param str GBK-encoded input; it is not modified.
     * @return Copy of str without leading/trailing ASCII whitespace or
     *         full-width spaces. Interior whitespace is preserved.
     */
    static std::string strip(const std::string& str)
    {
        return rstrip(lstrip(str));
    }

    /**
     * @brief Removes leading whitespace from a GBK string.
     * @param str GBK-encoded input; it is not modified.
     * @return Copy of str starting at its first non-whitespace character
     *         (empty when str is empty or all whitespace).
     */
    static std::string lstrip(const std::string& str)
    {
        const size_t len = str.length();
        size_t start = 0;
        while (start < len && is_whitespace_char(str, start))
        {
            start = next_char_pos(str, start);
        }
        return str.substr(start);
    }

    /**
     * @brief Removes trailing whitespace from a GBK string.
     * @param str GBK-encoded input; it is not modified.
     * @return Copy of str ending just after its last non-whitespace
     *         character (empty when str is empty or all whitespace).
     */
    static std::string rstrip(const std::string& str)
    {
        // Scan forward instead of backward: a GBK trail byte can also be a
        // legal lead byte or an ASCII value, so character boundaries are only
        // unambiguous when decoded from the beginning of the string.
        const size_t len = str.length();
        size_t end = 0;  // one past the last non-whitespace character seen so far
        size_t pos = 0;
        while (pos < len)
        {
            const size_t next = next_char_pos(str, pos);
            if (!is_whitespace_char(str, pos))
            {
                end = next;
            }
            pos = next;
        }
        return str.substr(0, end);
    }
};