From c60b5980d3d5d5033744b98c675ba82d54dc7cd9 Mon Sep 17 00:00:00 2001 From: Bulat Kurbanov Date: Thu, 16 Mar 2023 21:12:27 +0100 Subject: [PATCH] Update fix_annotation_text --- Cargo.lock | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 2 + src/utils.rs | 19 +++--- 3 files changed, 171 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 25c1f2c..b2b6e1f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,19 @@ dependencies = [ "memchr", ] +[[package]] +name = "ammonia" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e6d1c7838db705c9b756557ee27c384ce695a1c51a6fe528784cb1c6840170" +dependencies = [ + "html5ever", + "maplit", + "once_cell", + "tendril", + "url", +] + [[package]] name = "android_system_properties" version = "0.1.5" @@ -472,6 +485,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.27" @@ -657,6 +680,20 @@ dependencies = [ "winapi", ] +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "http" version = "0.2.9" @@ -840,6 +877,7 @@ checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c" name = "library_updater" version = "0.1.0" dependencies = [ + "ammonia", "async-compression", "async-trait", "axum", @@ -849,6 +887,7 @@ dependencies = [ "futures", "lazy_static", "log", + "maplit", "reqwest", "sentry", "serde", @@ -894,6 +933,32 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf 0.10.1", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -972,6 +1037,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + [[package]] name = "nom" version = "7.1.3" @@ -1111,13 +1182,51 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + [[package]] name = "phf" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c" dependencies = [ - "phf_shared", + "phf_shared 0.11.1", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", ] [[package]] @@ -1203,6 +1312,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "proc-macro2" version = "1.0.52" @@ -1587,6 +1702,32 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "741682811f723ed69fd5640c56a4aa91e2dea768c3acf899da7155d64fa27086" +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared 0.10.0", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator", + "phf_shared 0.10.0", + "proc-macro2", + "quote", +] + [[package]] name = "stringprep" version = "0.1.2" @@ -1633,6 +1774,17 @@ dependencies = [ "windows-sys 0.42.0", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "termcolor" version = "1.2.0" @@ -1786,7 +1938,7 @@ dependencies = [ "log", "parking_lot", "percent-encoding", - "phf", + "phf 0.11.1", "pin-project-lite", "postgres-protocol", "postgres-types", @@ -1950,6 +2102,12 @@ dependencies = [ "serde", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "uuid" version = "1.3.0" diff --git a/Cargo.toml b/Cargo.toml index 0adbb4a..cfa8ca3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,3 +24,5 @@ serde = { version = "1.0.144", features = ["derive"] } serde_json = "1.0.85" tokio-cron-scheduler = "0.8.1" axum = "0.5.16" +ammonia = "3" +maplit = "1.0.2" diff --git a/src/utils.rs b/src/utils.rs index 3b486ba..e43f0c4 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,6 +1,8 @@ use std::fs::File; use std::io::{self, BufRead}; use std::path::Path; +use ammonia::Builder; +use maplit::hashset; pub fn read_lines

(filename: P) -> io::Result>> where @@ -23,14 +25,11 @@ pub fn parse_lang(s: &str) -> String { } pub fn fix_annotation_text(text: &str) -> String { - text.replace(" ", "") - .replace("[b]", "") - .replace("[/b]", "") - .replace("[hr]", "") - .replace("\\\"", "\"") - .replace("\\'", "'") - .replace("

", "") - .replace("

", "") - .replace("
", "\n") - .replace("\\n", "\n") + let temp_text = text.replace("
", "\n").replace("\\n", "\n"); + + let tags = hashset!["a"]; + Builder::new() + .tags(tags) + .clean(&temp_text) + .to_string() }