|
|
#pragma once
|
|
|
|
|
|
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>
|
|
|
|
|
|
/*
 * Whitespace stripping for std::string values that hold GB2312/GBK-encoded
 * text.
 *
 * The string is always decoded front to back, one character at a time (a
 * single byte, or a valid two-byte GBK sequence), so a double-byte character
 * is never split and none of its bytes is mistaken for whitespace.
 *
 * Recognised whitespace: whatever std::isspace accepts for bytes below 0x80,
 * plus the GBK full-width (ideographic) space 0xA1 0xA1.
 */
class GBKStripper
{
private:
    // One rectangular region of the GBK code space: an inclusive range of
    // lead (high) bytes combined with an inclusive range of trail (low) bytes.
    struct GbkByteRange
    {
        uint8_t high_start;
        uint8_t high_end;
        uint8_t low_start;
        uint8_t low_end;
        bool skip_0x7f; // true when trail byte 0x7F is a hole inside the range

        // Returns true when the byte pair (high, low) lies inside this region.
        bool contains(uint8_t high, uint8_t low) const
        {
            if (high < high_start || high > high_end)
            {
                return false;
            }
            if (low < low_start || low > low_end)
            {
                return false;
            }
            return !(skip_0x7f && low == 0x7F);
        }
    };

    // Returns true when (high, low) is a byte pair assigned by GB2312/GBK.
    // Range table reference: https://zhuanlan.zhihu.com/p/453675608
    //
    // The table is a function-local constant so that this header can be
    // included from several translation units without producing multiple
    // definitions of a static data member.
    static bool is_valid_gbk_pair(uint8_t high, uint8_t low)
    {
        static const GbkByteRange ranges[] = {
            {0xA1, 0xA9, 0xA1, 0xFE, false}, // 0xA1-0xA9 + 0xA1-0xFE
            {0xB0, 0xF7, 0xA1, 0xFE, false}, // 0xB0-0xF7 + 0xA1-0xFE
            {0x81, 0xA0, 0x40, 0xFE, true},  // 0x81-0xA0 + 0x40-0xFE (except 0x7F)
            {0xA8, 0xA9, 0x40, 0xA0, true},  // 0xA8-0xA9 + 0x40-0xA0 (except 0x7F)
            {0xAA, 0xFE, 0x40, 0xA0, true},  // 0xAA-0xFE + 0x40-0xA0 (except 0x7F)

            // Optional: user-defined areas (enable if they must be supported)
            // {0xA1, 0xA7, 0x40, 0xA0, true},  // user-defined area
            // {0xAA, 0xAF, 0xA1, 0xFE, false}, // user-defined area
            // {0xF8, 0xFE, 0xA1, 0xFE, false}, // user-defined area
        };

        for (const GbkByteRange& range : ranges)
        {
            if (range.contains(high, low))
            {
                return true;
            }
        }
        return false;
    }

    // Every GBK lead byte is >= 0x80; bytes below that are single-byte ASCII.
    static bool is_gbk_lead_byte(uint8_t byte)
    {
        return byte >= 0x80;
    }

    // Returns true for the GBK full-width space (U+3000), encoded as 0xA1 0xA1.
    static bool is_gbk_full_width_space(uint8_t high, uint8_t low)
    {
        return high == 0xA1 && low == 0xA1;
    }

    // Returns true when the two bytes starting at pos form a valid GBK
    // character. A lead byte truncated by the end of the string is not valid.
    static bool is_valid_gbk_char(const std::string& str, size_t pos)
    {
        if (pos >= str.length() || str.length() - pos < 2)
        {
            return false;
        }
        return is_valid_gbk_pair(static_cast<uint8_t>(str[pos]),
                                 static_cast<uint8_t>(str[pos + 1]));
    }

    // Returns true when the character starting at pos is whitespace.
    // pos must be a character boundary.
    static bool is_whitespace_char(const std::string& str, size_t pos)
    {
        if (pos >= str.length())
        {
            return false;
        }

        const uint8_t byte = static_cast<uint8_t>(str[pos]);
        if (!is_gbk_lead_byte(byte))
        {
            // ASCII whitespace; the value is < 0x80 so isspace is well defined.
            return std::isspace(byte) != 0;
        }

        // Double-byte character: only the full-width space counts.
        return is_valid_gbk_char(str, pos)
            && is_gbk_full_width_space(byte, static_cast<uint8_t>(str[pos + 1]));
    }

    // Returns the position of the character following the one at pos:
    // pos + 2 for a valid GBK pair, pos + 1 otherwise (ASCII, or a stray /
    // truncated high byte that is passed through as a single byte).
    static size_t next_char_pos(const std::string& str, size_t pos)
    {
        if (pos >= str.length())
        {
            return pos;
        }
        const bool double_byte = is_gbk_lead_byte(static_cast<uint8_t>(str[pos]))
                              && is_valid_gbk_char(str, pos);
        return pos + (double_byte ? 2 : 1);
    }

    // Returns the index of the first non-whitespace character, or
    // str.length() when the string is empty or all whitespace.
    static size_t content_begin(const std::string& str)
    {
        size_t pos = 0;
        while (pos < str.length() && is_whitespace_char(str, pos))
        {
            pos = next_char_pos(str, pos);
        }
        return pos;
    }

    // Returns one past the last byte of the last non-whitespace character at
    // or after `from` (which must be a character boundary), or `from` when
    // there is none.
    //
    // The scan runs forward on purpose: GBK trail bytes overlap the lead-byte
    // range, so character boundaries cannot be recovered reliably by walking
    // backwards from the end of the string.
    static size_t content_end(const std::string& str, size_t from)
    {
        size_t end = from;
        size_t pos = from;
        while (pos < str.length())
        {
            const size_t next = next_char_pos(str, pos);
            if (!is_whitespace_char(str, pos))
            {
                end = next;
            }
            pos = next;
        }
        return end;
    }

public:
    /**
     * @brief Removes leading and trailing whitespace (including the GBK
     *        full-width space) from a string.
     * @param str Input string, GB2312/GBK encoded.
     * @return Copy of str without leading/trailing whitespace; empty when str
     *         is empty or consists only of whitespace.
     */
    static std::string strip(const std::string& str)
    {
        const size_t begin = content_begin(str);
        const size_t end = content_end(str, begin);
        return str.substr(begin, end - begin);
    }

    /**
     * @brief Removes leading whitespace (including the GBK full-width space).
     * @param str Input string, GB2312/GBK encoded.
     * @return Copy of str starting at its first non-whitespace character.
     */
    static std::string lstrip(const std::string& str)
    {
        return str.substr(content_begin(str));
    }

    /**
     * @brief Removes trailing whitespace (including the GBK full-width space).
     * @param str Input string, GB2312/GBK encoded.
     * @return Copy of str ending at its last non-whitespace character.
     */
    static std::string rstrip(const std::string& str)
    {
        return str.substr(0, content_end(str, 0));
    }
};