You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kev/Drawer/TinyChat/Messages/Formatting/PlainTextMessageFormatter.cs

140 lines
5.0 KiB
C#

using System.Text.RegularExpressions;
namespace TinyChat.Messages.Formatting;
/// <summary>
/// Renders message content as plain text by stripping common formatting like Markdown and HTML.
/// </summary>
public partial class PlainTextMessageFormatter : IMessageFormatter
{
[GeneratedRegex(@"<[^>]*>", RegexOptions.Compiled)]
private static partial Regex HtmlTagsRegex();
[GeneratedRegex(@"!\[([^\]]*)\]\([^\)]+\)", RegexOptions.Compiled)]
private static partial Regex MarkdownImagesRegex();
[GeneratedRegex(@"\[([^\]]+)\]\([^\)]+\)", RegexOptions.Compiled)]
private static partial Regex MarkdownLinksRegex();
[GeneratedRegex(@"^#{1,6}\s+", RegexOptions.Compiled | RegexOptions.Multiline)]
private static partial Regex MarkdownHeadersRegex();
[GeneratedRegex(@"(\*\*|__)(.*?)\1", RegexOptions.Compiled)]
private static partial Regex MarkdownBoldRegex();
[GeneratedRegex(@"(\*|_)(.*?)\1", RegexOptions.Compiled)]
private static partial Regex MarkdownItalicRegex();
[GeneratedRegex(@"~~(.*?)~~", RegexOptions.Compiled)]
private static partial Regex MarkdownStrikethroughRegex();
[GeneratedRegex(@"`([^`]*)`", RegexOptions.Compiled)]
private static partial Regex MarkdownInlineCodeRegex();
[GeneratedRegex(@"```(?:\w+)?(?:\s*\n)?([\s\S]*?)```", RegexOptions.Compiled)]
private static partial Regex MarkdownCodeBlockRegex();
[GeneratedRegex(@"<ul[^>]*>(.*?)</ul>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline)]
private static partial Regex HtmlUnorderedListRegex();
[GeneratedRegex(@"<ol[^>]*>(.*?)</ol>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline)]
private static partial Regex HtmlOrderedListRegex();
[GeneratedRegex(@"<li[^>]*>(.*?)</li>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline)]
private static partial Regex HtmlListItemRegex();
/// <summary>
/// Renders the message content as plain text by removing formatting.
/// </summary>
/// <param name="content">The message content to format.</param>
/// <returns>Plain text representation of the content.</returns>
public string Format(IChatMessageContent content)
{
if (content is StringMessageContent stringContent)
return Format(stringContent.ToString());
throw new NotSupportedException($"Only {nameof(StringMessageContent)} is supported by {nameof(PlainTextMessageFormatter)}.");
}
/// <summary>
/// Renders the message content as plain text by removing formatting.
/// </summary>
/// <param name="content">The message content to format.</param>
/// <returns>Plain text representation of the content.</returns>
public string Format(string content)
{
var text = content ?? string.Empty;
// Convert HTML lists to plain text BEFORE removing other HTML tags
text = ConvertHtmlListsToPlainText(text);
// Replace Markdown code blocks with blank-line-separated content (process first to avoid interference with other patterns)
text = MarkdownCodeBlockRegex().Replace(text, match =>
{
var codeContent = match.Groups[1].Value;
// Trim trailing newline from code content to avoid double newlines
if (codeContent.EndsWith('\n'))
codeContent = codeContent[..^1];
return $"\n{codeContent}\n";
});
// Remove Markdown inline code (keep content)
text = MarkdownInlineCodeRegex().Replace(text, "$1");
// Remove Markdown images (keep alt text)
text = MarkdownImagesRegex().Replace(text, "$1");
// Remove Markdown links (keep link text)
text = MarkdownLinksRegex().Replace(text, "$1");
// Remove Markdown headers (process before bold/italic to avoid conflicts)
text = MarkdownHeadersRegex().Replace(text, string.Empty);
// Remove Markdown bold (process before italic to handle *** correctly)
text = MarkdownBoldRegex().Replace(text, "$2");
// Remove Markdown italic
text = MarkdownItalicRegex().Replace(text, "$2");
// Remove Markdown strikethrough
text = MarkdownStrikethroughRegex().Replace(text, "$1");
// Remove HTML tags (process last to catch any HTML in the original content)
text = HtmlTagsRegex().Replace(text, string.Empty);
return text.Trim();
}
private static string ConvertHtmlListsToPlainText(string text)
{
// We need to process nested lists from innermost to outermost
// So we keep replacing until no more <ul> or <ol> tags are found
var changed = true;
while (changed)
{
var originalText = text;
// Convert unordered lists <ul> to "- " prefixed items
text = HtmlUnorderedListRegex().Replace(text, match =>
{
var listContent = match.Groups[1].Value;
var items = HtmlListItemRegex().Matches(listContent);
var result = string.Join("\n", items.Cast<Match>().Select(m => $"- {m.Groups[1].Value.Trim()}"));
return result;
});
// Convert ordered lists <ol> to numbered items
text = HtmlOrderedListRegex().Replace(text, match =>
{
var listContent = match.Groups[1].Value;
var items = HtmlListItemRegex().Matches(listContent);
var result = string.Join("\n", items.Cast<Match>().Select((m, i) => $"{i + 1}. {m.Groups[1].Value.Trim()}"));
return result;
});
changed = text != originalText;
}
return text;
}
}