proper UTF-8 support for input words & better "cutting" using syllable cuts

This commit is contained in:
matthieu 2025-01-29 13:24:30 +04:00
parent 3d88aa6b62
commit 2827ce3e82
5 changed files with 58 additions and 20 deletions

15
autofeur_db/Cargo.lock generated
View file

@ -60,16 +60,19 @@ dependencies = [
"hypher", "hypher",
"itertools", "itertools",
"kdam", "kdam",
"levenshtein",
"querystring", "querystring",
"rand", "rand",
"reqwest", "reqwest",
"serde", "serde",
"serde_json", "serde_json",
"strsim",
"tokio", "tokio",
"tower", "tower",
"tower-http", "tower-http",
"trie-rs", "trie-rs",
"unicode-segmentation", "unicode-segmentation",
"url",
] ]
[[package]] [[package]]
@ -650,6 +653,12 @@ version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "levenshtein"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.139" version = "0.2.139"
@ -1239,6 +1248,12 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]] [[package]]
name = "syn" name = "syn"
version = "1.0.107" version = "1.0.107"

View file

@ -24,6 +24,9 @@ anyhow = "1.0.68"
itertools = "0.10.5" itertools = "0.10.5"
querystring = "1.1.0" querystring = "1.1.0"
hypher = { version = "0.1", features = ["english", "french"] } hypher = { version = "0.1", features = ["english", "french"] }
levenshtein = "1.0.5"
strsim = "0.11.1"
url = "*"
[[bin]] [[bin]]
name = "generate" name = "generate"

View file

@ -6,16 +6,7 @@ use std::collections::HashMap;
use std::{fs, net::SocketAddr, sync::Arc}; use std::{fs, net::SocketAddr, sync::Arc};
use tower::{make::Shared, ServiceBuilder}; use tower::{make::Shared, ServiceBuilder};
use tower_http::add_extension::AddExtensionLayer; use tower_http::add_extension::AddExtensionLayer;
use url::form_urlencoded::parse;
/// Parses a URL query string (`key=value&key2=value2`) into a map.
///
/// Pairs are separated by `&`; each pair is split on its first `=`. Segments
/// without an `=` are ignored. When a key is repeated, the last occurrence
/// wins. Keys and values are form-url-decoded (`+` becomes a space and `%XX`
/// escapes become bytes), so non-ASCII input such as `caf%C3%A9` is returned
/// as `café` instead of the raw escape sequence.
fn parse_query(query: &str) -> HashMap<String, String> {
    query
        .split('&')
        .filter_map(|pair| pair.split_once('='))
        .map(|(key, value)| (decode_component(key), decode_component(value)))
        .collect()
}

/// Decodes one form-url-encoded component.
///
/// Malformed escapes (a `%` not followed by two hex digits) are kept verbatim,
/// and byte sequences that are not valid UTF-8 are replaced with U+FFFD rather
/// than causing a failure.
fn decode_component(raw: &str) -> String {
    let bytes = raw.as_bytes();
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut pos = 0;
    while pos < bytes.len() {
        match bytes[pos] {
            b'+' => decoded.push(b' '),
            b'%' => {
                // Only consume the escape when both following bytes are hex digits.
                let escaped = bytes
                    .get(pos + 1..pos + 3)
                    .and_then(|pair| Some((hex_value(pair[0])?, hex_value(pair[1])?)));
                if let Some((high, low)) = escaped {
                    decoded.push((high << 4) | low);
                    pos += 3;
                    continue;
                }
                decoded.push(b'%');
            }
            other => decoded.push(other),
        }
        pos += 1;
    }
    // Decoding happens on bytes because one character may span several escapes.
    String::from_utf8_lossy(&decoded).into_owned()
}

/// Returns the numeric value of an ASCII hex digit, or `None` for any other byte.
fn hex_value(byte: u8) -> Option<u8> {
    // `to_digit(16)` yields at most 15, so the narrowing cast cannot truncate.
    (byte as char).to_digit(16).map(|digit| digit as u8)
}
fn anyhow_response(err: anyhow::Error) -> Response<Body> { fn anyhow_response(err: anyhow::Error) -> Response<Body> {
Response::builder() Response::builder()
@ -34,7 +25,18 @@ async fn handler(request: Request<Body>) -> Result<Response<Body>, hyper::Error>
Ok(ok) => ok, Ok(ok) => ok,
Err(err) => return Ok(err), Err(err) => return Ok(err),
}; };
let data = match parse_query(query)
let params: HashMap<String, String> = request
.uri()
.query()
.map(|v| {
url::form_urlencoded::parse(v.as_bytes())
.into_owned()
.collect()
})
.unwrap_or_else(HashMap::new);
let data = match params
.get("grapheme") .get("grapheme")
.ok_or_else(|| anyhow_response(anyhow!("grapheme argument is not specified"))) .ok_or_else(|| anyhow_response(anyhow!("grapheme argument is not specified")))
{ {

View file

@ -20,6 +20,10 @@ async fn call_inference_service(word: &str) -> anyhow::Result<String> {
impl Save<'_> { impl Save<'_> {
pub async fn inference(&self, prefix: &str) -> anyhow::Result<String> { pub async fn inference(&self, prefix: &str) -> anyhow::Result<String> {
let phonemes = call_inference_service(prefix).await?; let phonemes = call_inference_service(prefix).await?;
let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French)
.into_iter()
.collect_vec();
println!("syl: [{}]", source_word_syllabes.join(","));
let completion = self let completion = self
.trie .trie
@ -37,8 +41,9 @@ impl Save<'_> {
println!("Matching {} by adding {}", word, completion); println!("Matching {} by adding {}", word, completion);
let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French).into_iter().collect_vec(); let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French)
let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French).into_iter().collect_vec(); .into_iter()
.collect_vec();
// input: test // input: test
// output found: testames // output found: testames
@ -46,18 +51,33 @@ impl Save<'_> {
// output expect: tames // output expect: tames
// we just need to remove the prefix // we just need to remove the prefix
println!(
"[{}] cmp [{}]",
source_word_syllabes.join(","),
completed_syllabes.join(",")
);
let mut i = 0; let mut i = 0;
for (index, syl) in source_word_syllabes.iter().enumerate() { let maxindex = source_word_syllabes.len() - 1;
if *source_word_syllabes[index] == **syl { for (index, syl) in completed_syllabes.iter().enumerate() {
if maxindex < index {
break;
}
let phon1 = &source_word_syllabes[index].to_lowercase();
let phon2 = &(**syl).to_lowercase();
println!("comparing syllab {} vs {}", phon1, phon2);
if strsim::levenshtein(phon1, phon2) < 2 {
i = index i = index
} else { } else {
println!("found scyl break at {}", i);
break; break;
} }
} }
completed_syllabes.drain(0..i); // we finally just need to compute the end of the word which matches the sound // we finally just need to compute the end of the word which matches the sound
let found = completed_syllabes.join(""); let found = completed_syllabes.drain(i+1..).join("");
println!("{} is equivalent to {}", completion, found); println!("{} is equivalent to {}", completion, found);
Ok(format!("{} ({})", found, word)) Ok(format!("{} ({})", found, word))

View file

@ -43,8 +43,6 @@ const sanitizeWord = (sentence) => {
.trim() .trim()
.split(" ") .split(" ")
.slice(-1)[0] .slice(-1)[0]
.normalize('NFKD')
.replace(/[\u0300-\u036f]/g, "")
.replaceAll(/(?:https?|ftp):\/\/[\n\S]+/g, "") .replaceAll(/(?:https?|ftp):\/\/[\n\S]+/g, "")
.replaceAll(/\:([a-z]|[A-Z])+\:/g, "") .replaceAll(/\:([a-z]|[A-Z])+\:/g, "")
.replaceAll(/(\?|\!|\.|\,|\;)/g, "") .replaceAll(/(\?|\!|\.|\,|\;)/g, "")