|
|
#include "StreamingTsvParser.h"
|
|
|
#include <algorithm>
|
|
|
#include <cstring>
|
|
|
|
|
|
namespace
|
|
|
{
|
|
|
/// Extracts the first *complete* line from the front of @p buf.
///
/// A line is complete only when its terminator has been seen: "\n", "\r\n",
/// or a bare "\r" that is followed by at least one more byte. The terminator
/// is consumed but not copied into @p line.
///
/// @param buf   Accumulated, not-yet-consumed input.
/// @param line  Receives the line content without its terminator; cleared
///              when no complete line is available.
/// @return Number of bytes to drop from the front of @p buf (line plus
///         terminator), or 0 when @p buf holds no complete line yet.
///
/// @note A '\r' that is the very last byte of @p buf is NOT treated as a
///       terminator, because the matching '\n' may still arrive in the next
///       chunk. At end of stream the caller has to treat such a trailing
///       '\r' as a terminator itself.
size_t TakeLine(const std::string& buf, std::string& line)
{
    const size_t eol = buf.find_first_of("\r\n");
    if (eol == std::string::npos)
    {
        // No terminator yet: the data is an unfinished line, keep buffering.
        line.clear();
        return 0;
    }

    size_t consumed = eol + 1;
    if (buf[eol] == '\r')
    {
        if (consumed == buf.size())
        {
            // Cannot tell a bare CR from the first half of a CRLF yet.
            line.clear();
            return 0;
        }
        if (buf[consumed] == '\n')
        {
            ++consumed; // swallow the LF of a CRLF pair
        }
    }

    line.assign(buf, 0, eol);
    return consumed;
}
|
|
|
|
|
|
/// Splits @p line on TAB characters into @p out.
///
/// @p out is cleared first. Empty fields are preserved, so the result always
/// holds (number of tabs + 1) entries; an empty @p line yields a single
/// empty field.
///
/// @param line  One record, without its line terminator.
/// @param out   Receives the fields in order of appearance.
void SplitByTab(const std::string& line, std::vector<std::string>& out)
{
    out.clear();

    size_t fieldBegin = 0;
    size_t tabPos = line.find('\t');
    while (tabPos != std::string::npos)
    {
        out.emplace_back(line, fieldBegin, tabPos - fieldBegin);
        fieldBegin = tabPos + 1;
        tabPos = line.find('\t', fieldBegin);
    }

    // Whatever follows the last tab (possibly nothing) is the final field.
    out.emplace_back(line, fieldBegin, std::string::npos);
}
|
|
|
}
|
|
|
|
|
|
void StreamingTsvParser::Feed(const void* data, size_t size)
|
|
|
{
|
|
|
if (data == nullptr || size == 0)
|
|
|
{
|
|
|
return;
|
|
|
}
|
|
|
const char* p = static_cast<const char*>(data);
|
|
|
m_buffer.append(p, size);
|
|
|
ProcessBuffer();
|
|
|
}
|
|
|
|
|
|
void StreamingTsvParser::End()
|
|
|
{
|
|
|
// 将剩余内容当作最后一行(可能无换行符)
|
|
|
if (!m_buffer.empty())
|
|
|
{
|
|
|
std::vector<std::string> fields;
|
|
|
SplitByTab(m_buffer, fields);
|
|
|
if (!fields.empty() && m_callback)
|
|
|
{
|
|
|
m_callback(m_lineIndex, fields);
|
|
|
}
|
|
|
m_buffer.clear();
|
|
|
}
|
|
|
}
|
|
|
|
|
|
void StreamingTsvParser::ProcessBuffer()
|
|
|
{
|
|
|
if (!m_callback)
|
|
|
{
|
|
|
return;
|
|
|
}
|
|
|
std::string line;
|
|
|
size_t consumed = TakeLine(m_buffer, line);
|
|
|
while (consumed > 0)
|
|
|
{
|
|
|
std::vector<std::string> fields;
|
|
|
SplitByTab(line, fields);
|
|
|
if (!fields.empty())
|
|
|
{
|
|
|
m_callback(m_lineIndex++, fields);
|
|
|
}
|
|
|
m_buffer.erase(0, consumed);
|
|
|
consumed = TakeLine(m_buffer, line);
|
|
|
}
|
|
|
}
|