// kev/Drawer/GVision/SurfaceGrid/GBKStripper.h

#pragma once

#include <cctype>
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>

/*
 * Whitespace stripping for std::string values that hold GB2312/GBK-encoded
 * bytes. Character boundaries are always found by scanning forward from a
 * known boundary, so a double-byte character is never split in half.
 */
class GBKStripper
{
private:
    // One rectangular region of the double-byte code space: a lead-byte
    // interval, a trail-byte interval, and trail bytes excluded from it.
    struct GbkByteRange
    {
        uint8_t high_start = 0;
        uint8_t high_end = 0;
        uint8_t low_start = 0;
        uint8_t low_end = 0;
        std::unordered_set<uint8_t> excluded_bytes;

        // True when (high, low) lies inside both intervals and `low` is not excluded.
        bool contains(uint8_t high, uint8_t low) const
        {
            if (high < high_start || high > high_end)
            {
                return false;
            }

            if (low < low_start || low > low_end)
            {
                return false;
            }

            return excluded_bytes.count(low) == 0;
        }
    };

    // Table of accepted double-byte regions.
    // Range table source: https://zhuanlan.zhihu.com/p/453675608
    //
    // Held as a function-local static so that this header can be included
    // from several translation units without producing multiple definitions,
    // and so that the table is built before its first use.
    static const std::vector<GbkByteRange>& gbk_byte_ranges()
    {
        static const std::vector<GbkByteRange> ranges = {
            {0xA1, 0xA9, 0xA1, 0xFE, {}},       // 0xA1-0xA9 + 0xA1-0xFE
            {0xB0, 0xF7, 0xA1, 0xFE, {}},       // 0xB0-0xF7 + 0xA1-0xFE
            {0x81, 0xA0, 0x40, 0xFE, {0x7F}},   // 0x81-0xA0 + 0x40-0xFE (excluding 0x7F)
            {0xA8, 0xA9, 0x40, 0xA0, {0x7F}},   // 0xA8-0xA9 + 0x40-0xA0 (excluding 0x7F)
            {0xAA, 0xFE, 0x40, 0xA0, {0x7F}},   // 0xAA-0xFE + 0x40-0xA0 (excluding 0x7F)

            // Optional: user-defined areas (enable if they must be supported)
            // {0xA1, 0xA7, 0x40, 0xA0, {0x7F}},
            // {0xAA, 0xAF, 0xA1, 0xFE, {}},
            // {0xF8, 0xFE, 0xA1, 0xFE, {}},
        };
        return ranges;
    }

    // GBK lead bytes are 0x81-0xFE; 0x80 and 0xFF never start a character.
    static bool is_gbk_lead_byte(uint8_t byte)
    {
        return byte >= 0x81 && byte <= 0xFE;
    }

    // The full-width (ideographic) space is encoded as 0xA1 0xA1.
    static bool is_gbk_full_width_space(uint8_t high, uint8_t low)
    {
        return high == 0xA1 && low == 0xA1;
    }

    // True when the two bytes starting at `pos` form a double-byte character
    // listed in the range table. False when fewer than two bytes remain.
    static bool is_valid_gbk_char(const std::string& str, size_t pos)
    {
        if (pos >= str.length() || str.length() - pos < 2)
        {
            return false;
        }

        const uint8_t high = static_cast<uint8_t>(str[pos]);
        const uint8_t low = static_cast<uint8_t>(str[pos + 1]);

        for (const auto& range : gbk_byte_ranges())
        {
            if (range.contains(high, low))
            {
                return true;
            }
        }
        return false;
    }

    // True when the character starting at `pos` is ASCII whitespace or the
    // full-width space. `pos` must be a character boundary.
    static bool is_whitespace_char(const std::string& str, size_t pos)
    {
        if (pos >= str.length())
        {
            return false;
        }

        const uint8_t byte = static_cast<uint8_t>(str[pos]);

        // Single-byte (ASCII) whitespace.
        if (byte < 0x80)
        {
            return std::isspace(byte) != 0;
        }

        // Double-byte character: only the full-width space counts.
        if (is_valid_gbk_char(str, pos))
        {
            return is_gbk_full_width_space(byte, static_cast<uint8_t>(str[pos + 1]));
        }

        return false;
    }

    // Index of the character following the one that starts at `pos`.
    // A valid double-byte character advances by 2; anything else (ASCII,
    // a dangling or invalid lead byte) advances by 1.
    static size_t next_char_pos(const std::string& str, size_t pos)
    {
        if (pos >= str.length())
        {
            return pos;
        }

        const uint8_t current_byte = static_cast<uint8_t>(str[pos]);
        if (is_gbk_lead_byte(current_byte) && is_valid_gbk_char(str, pos))
        {
            return pos + 2;
        }

        return pos + 1;
    }

    // Index of the first non-whitespace character, or str.length() if none.
    static size_t trimmed_start(const std::string& str)
    {
        size_t start = 0;
        while (start < str.length() && is_whitespace_char(str, start))
        {
            start = next_char_pos(str, start);
        }
        return start;
    }

    // Index one past the last non-whitespace character found at or after
    // `start` (which must be a character boundary); returns `start` if none.
    //
    // The scan runs forward on purpose: a trail byte can have the same value
    // as a lead byte or an ASCII byte, so walking backward from the end
    // cannot reliably tell where a character begins.
    static size_t trimmed_end(const std::string& str, size_t start)
    {
        size_t end = start;
        size_t pos = start;
        while (pos < str.length())
        {
            const size_t next = next_char_pos(str, pos);
            if (!is_whitespace_char(str, pos))
            {
                end = next;
            }
            pos = next;
        }
        return end;
    }

public:
    /**
     * @brief Removes leading and trailing whitespace (ASCII whitespace and the full-width space).
     * @param str Input string.
     * @return Copy of str without leading and trailing whitespace.
     */
    static std::string strip(const std::string& str)
    {
        const size_t start = trimmed_start(str);
        const size_t end = trimmed_end(str, start);
        return str.substr(start, end - start);
    }

    /**
     * @brief Removes leading whitespace (ASCII whitespace and the full-width space).
     * @param str Input string.
     * @return Copy of str without leading whitespace.
     */
    static std::string lstrip(const std::string& str)
    {
        return str.substr(trimmed_start(str));
    }

    /**
     * @brief Removes trailing whitespace (ASCII whitespace and the full-width space).
     * @param str Input string.
     * @return Copy of str without trailing whitespace.
     */
    static std::string rstrip(const std::string& str)
    {
        return str.substr(0, trimmed_end(str, 0));
    }
};