fix(web_search): complete HTML entity decoding with numeric character references (#822)
* fix(web_search): complete HTML entity decoding with numeric character references The decode_html_entities function only handled 7 named entities and lacked support for decimal (&#NN;) and hex (&#xHH;) numeric character references, which are common in search engine result snippets. This caused garbled text. Replace the hard-coded replace chains with a regex-based decoder that handles: - All named entities (amp, lt, gt, quot, apos, nbsp, copy, reg, mdash, ndash, lsquo, rsquo, ldquo, rdquo, hellip) - Decimal numeric references (`&#65;` → A) - Hex numeric references (`&#x41;` → A) - Unknown entities are passed through unchanged * style(web_search): apply rustfmt --------- Co-authored-by: Hunter Bown <hmbown@gmail.com>
This commit is contained in:
@@ -474,13 +474,53 @@ fn strip_html_tags(text: &str) -> String {
|
||||
}
|
||||
|
||||
/// Decodes HTML character references in `text` and returns the decoded string.
///
/// Recognized forms, handled in a single left-to-right pass:
/// - named entities: `amp`, `lt`, `gt`, `quot`, `apos`, `nbsp`, `copy`, `reg`,
///   `mdash`, `ndash`, `lsquo`, `rsquo`, `ldquo`, `rdquo`, `hellip`
/// - decimal numeric references such as `&#65;`
/// - hexadecimal numeric references such as `&#x41;` or `&#X41;`
///
/// Anything else that starts with `&` (unknown names, missing `;`, malformed
/// digits) is copied through unchanged. Decoded output is never rescanned, so
/// `&amp;lt;` becomes `&lt;`, not `<`.
///
/// Numeric references that do not name a usable Unicode scalar value (NUL,
/// surrogates, values above U+10FFFF, or digit runs that overflow `u32`)
/// decode to U+FFFD REPLACEMENT CHARACTER.
fn decode_html_entities(text: &str) -> String {
    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp_at) = rest.find('&') {
        // Copy the literal run before the ampersand verbatim.
        decoded.push_str(&rest[..amp_at]);
        let candidate = &rest[amp_at..];

        match decode_entity_prefix(candidate) {
            Some((ch, consumed)) => {
                decoded.push(ch);
                rest = &candidate[consumed..];
            }
            None => {
                // Not a reference we understand: keep the `&` and resume right
                // after it so a later `&` in the same run still gets a chance.
                decoded.push('&');
                rest = &candidate[1..];
            }
        }
    }

    decoded.push_str(rest);
    decoded
}

/// Tries to decode one character reference at the very start of `candidate`.
///
/// `candidate` is expected to begin with `&`. On success returns the decoded
/// character together with the number of bytes consumed (from `&` through the
/// terminating `;`). Returns `None` when the prefix is not a recognized,
/// well-formed reference.
fn decode_entity_prefix(candidate: &str) -> Option<(char, usize)> {
    let after_amp = candidate.strip_prefix('&')?;

    // A reference body consists only of ASCII letters, digits and `#`. Stopping
    // at the first other character keeps the scan short on malformed input.
    let body_len = after_amp.find(|c: char| !(c.is_ascii_alphanumeric() || c == '#'))?;
    if !after_amp[body_len..].starts_with(';') {
        return None;
    }
    let body = &after_amp[..body_len];

    let decoded = if let Some(numeric) = body.strip_prefix('#') {
        // HTML allows either `x` or `X` as the hexadecimal marker.
        let hex_digits = numeric
            .strip_prefix('x')
            .or_else(|| numeric.strip_prefix('X'));
        let (digits, radix) = match hex_digits {
            Some(hex) => (hex, 16),
            None => (numeric, 10),
        };
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        u32::from_str_radix(digits, radix)
            .ok()
            // A NUL reference must not leak a raw `\0` into snippet text.
            .filter(|&code_point| code_point != 0)
            .and_then(char::from_u32)
            .unwrap_or('\u{FFFD}')
    } else {
        match body {
            "amp" => '&',
            "lt" => '<',
            "gt" => '>',
            "quot" => '"',
            "apos" => '\'',
            // NOTE(review): mapped to a plain space, matching the earlier
            // `.replace` chain as far as can be told; confirm U+0020 rather
            // than U+00A0 is the intended output.
            "nbsp" => ' ',
            "copy" => '\u{00A9}',
            "reg" => '\u{00AE}',
            "mdash" => '\u{2014}',
            "ndash" => '\u{2013}',
            "lsquo" => '\u{2018}',
            "rsquo" => '\u{2019}',
            "ldquo" => '\u{201C}',
            "rdquo" => '\u{201D}',
            "hellip" => '\u{2026}',
            _ => return None,
        }
    };

    // `&` + body + `;`
    Some((decoded, body_len + 2))
}
|
||||
|
||||
fn url_encode(input: &str) -> String {
|
||||
@@ -524,9 +564,47 @@ fn extract_query_param(url: &str, key: &str) -> Option<String> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{extract_search_query, optional_search_max_results};
|
||||
use super::{decode_html_entities, extract_search_query, optional_search_max_results};
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
fn decode_html_entities_handles_named_entities() {
    // Each input is the escaped entity form; the expected value is the
    // character it stands for.
    assert_eq!(decode_html_entities("&amp;"), "&");
    assert_eq!(decode_html_entities("&lt;"), "<");
    assert_eq!(decode_html_entities("&gt;"), ">");
    assert_eq!(decode_html_entities("&quot;"), "\"");
    assert_eq!(decode_html_entities("&apos;"), "'");
    assert_eq!(decode_html_entities("&nbsp;"), " ");
    assert_eq!(decode_html_entities("&copy;"), "\u{00A9}");
    assert_eq!(decode_html_entities("&mdash;"), "\u{2014}");
}
|
||||
|
||||
#[test]
fn decode_html_entities_handles_decimal_numeric_references() {
    // Decimal references: `&#NN;` where NN is the code point in base 10.
    assert_eq!(decode_html_entities("&#65;"), "A");
    assert_eq!(decode_html_entities("&#60;"), "<");
    assert_eq!(decode_html_entities("&#8211;"), "\u{2013}");
}
|
||||
|
||||
#[test]
fn decode_html_entities_handles_hex_numeric_references() {
    // Hex references: `&#xHH;` where HH is the code point in base 16.
    assert_eq!(decode_html_entities("&#x41;"), "A");
    assert_eq!(decode_html_entities("&#x3C;"), "<");
    assert_eq!(decode_html_entities("&#x2014;"), "\u{2014}");
}
|
||||
|
||||
#[test]
fn decode_html_entities_passthrough_unknown() {
    // An entity name the decoder does not know must come back untouched.
    let unrecognized = "&unknown;";
    let decoded = decode_html_entities(unrecognized);
    assert_eq!(decoded, "&unknown;");
}
|
||||
|
||||
#[test]
fn decode_html_entities_mixed_content() {
    // Named and numeric references interleaved with ordinary text.
    let input = "Hello &amp; welcome to &quot;Rust&#39;s world&quot; &mdash; enjoy!";
    let expected = "Hello & welcome to \"Rust's world\" \u{2014} enjoy!";
    assert_eq!(decode_html_entities(input), expected);
}
|
||||
|
||||
#[test]
|
||||
fn extract_search_query_accepts_legacy_query() {
|
||||
let query =
|
||||
|
||||
Reference in New Issue
Block a user