Proper UTF-8 support for input words & better "cutting" using syllable cuts

This commit is contained in:
matthieu 2025-01-29 13:24:30 +04:00
parent 3d88aa6b62
commit 2827ce3e82
5 changed files with 58 additions and 20 deletions

15
autofeur_db/Cargo.lock generated
View file

@ -60,16 +60,19 @@ dependencies = [
"hypher",
"itertools",
"kdam",
"levenshtein",
"querystring",
"rand",
"reqwest",
"serde",
"serde_json",
"strsim",
"tokio",
"tower",
"tower-http",
"trie-rs",
"unicode-segmentation",
"url",
]
[[package]]
@ -650,6 +653,12 @@ version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "levenshtein"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
[[package]]
name = "libc"
version = "0.2.139"
@ -1239,6 +1248,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "syn"
version = "1.0.107"

View file

@ -24,6 +24,9 @@ anyhow = "1.0.68"
itertools = "0.10.5"
querystring = "1.1.0"
hypher = { version = "0.1", features = ["english", "french"] }
levenshtein = "1.0.5"
strsim = "0.11.1"
url = "*"
[[bin]]
name = "generate"

View file

@ -6,16 +6,7 @@ use std::collections::HashMap;
use std::{fs, net::SocketAddr, sync::Arc};
use tower::{make::Shared, ServiceBuilder};
use tower_http::add_extension::AddExtensionLayer;
/// Parses a URL query string (`key=value&other=value`) into a map.
///
/// Pairs are separated by `&`; each pair is split on its first `=`, so a
/// value may itself contain `=`. Pairs without any `=` are ignored. When a
/// key appears more than once, the last occurrence wins.
///
/// Keys and values are decoded as `application/x-www-form-urlencoded` data:
/// `+` becomes a space and `%XX` escapes become the corresponding byte, so
/// percent-encoded UTF-8 input (e.g. `caf%C3%A9`) is returned as real text
/// instead of leaking the escapes to the caller.
fn parse_query(query: &str) -> HashMap<String, String> {
    query
        .split('&')
        .filter_map(|pair| pair.split_once('='))
        .map(|(key, value)| (decode_form_component(key), decode_form_component(value)))
        .collect()
}

/// Decodes a single form-urlencoded component (a key or a value).
///
/// A `%` that is not followed by two hexadecimal digits is kept literally,
/// and byte sequences that are not valid UTF-8 after decoding are replaced
/// with U+FFFD rather than causing a failure.
fn decode_form_component(raw: &str) -> String {
    let bytes = raw.as_bytes();
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut pos = 0;
    while pos < bytes.len() {
        match bytes[pos] {
            b'+' => {
                decoded.push(b' ');
                pos += 1;
            }
            b'%' => {
                let high = bytes.get(pos + 1).and_then(|b| hex_value(*b));
                let low = bytes.get(pos + 2).and_then(|b| hex_value(*b));
                if let (Some(high), Some(low)) = (high, low) {
                    decoded.push((high << 4) | low);
                    pos += 3;
                } else {
                    // Malformed escape: keep the '%' and carry on with the next byte.
                    decoded.push(b'%');
                    pos += 1;
                }
            }
            other => {
                decoded.push(other);
                pos += 1;
            }
        }
    }
    String::from_utf8_lossy(&decoded).into_owned()
}

/// Returns the numeric value of an ASCII hexadecimal digit, or `None` for any other byte.
fn hex_value(byte: u8) -> Option<u8> {
    char::from(byte)
        .to_digit(16)
        .and_then(|digit| u8::try_from(digit).ok())
}
use url::form_urlencoded::parse;
fn anyhow_response(err: anyhow::Error) -> Response<Body> {
Response::builder()
@ -34,7 +25,18 @@ async fn handler(request: Request<Body>) -> Result<Response<Body>, hyper::Error>
Ok(ok) => ok,
Err(err) => return Ok(err),
};
let data = match parse_query(query)
let params: HashMap<String, String> = request
.uri()
.query()
.map(|v| {
url::form_urlencoded::parse(v.as_bytes())
.into_owned()
.collect()
})
.unwrap_or_else(HashMap::new);
let data = match params
.get("grapheme")
.ok_or_else(|| anyhow_response(anyhow!("grapheme argument is not specified")))
{

View file

@ -20,6 +20,10 @@ async fn call_inference_service(word: &str) -> anyhow::Result<String> {
impl Save<'_> {
pub async fn inference(&self, prefix: &str) -> anyhow::Result<String> {
let phonemes = call_inference_service(prefix).await?;
let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French)
.into_iter()
.collect_vec();
println!("syl: [{}]", source_word_syllabes.join(","));
let completion = self
.trie
@ -37,8 +41,9 @@ impl Save<'_> {
println!("Matching {} by adding {}", word, completion);
let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French).into_iter().collect_vec();
let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French).into_iter().collect_vec();
let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French)
.into_iter()
.collect_vec();
// input: test
// output found: testames
@ -46,18 +51,33 @@ impl Save<'_> {
// output expect: tames
// we just need to remove the prefix
println!(
"[{}] cmp [{}]",
source_word_syllabes.join(","),
completed_syllabes.join(",")
);
let mut i = 0;
for (index, syl) in source_word_syllabes.iter().enumerate() {
if *source_word_syllabes[index] == **syl {
let maxindex = source_word_syllabes.len() - 1;
for (index, syl) in completed_syllabes.iter().enumerate() {
if maxindex < index {
break;
}
let phon1 = &source_word_syllabes[index].to_lowercase();
let phon2 = &(**syl).to_lowercase();
println!("comparing syllab {} vs {}", phon1, phon2);
if strsim::levenshtein(phon1, phon2) < 2 {
i = index
} else {
println!("found scyl break at {}", i);
break;
}
}
completed_syllabes.drain(0..i); // we finally just need to compute the end of the word which matches the sound
let found = completed_syllabes.join("");
// we finally just need to compute the end of the word which matches the sound
let found = completed_syllabes.drain(i+1..).join("");
println!("{} is equivalent to {}", completion, found);
Ok(format!("{} ({})", found, word))

View file

@ -43,8 +43,6 @@ const sanitizeWord = (sentence) => {
.trim()
.split(" ")
.slice(-1)[0]
.normalize('NFKD')
.replace(/[\u0300-\u036f]/g, "")
.replaceAll(/(?:https?|ftp):\/\/[\n\S]+/g, "")
.replaceAll(/\:([a-z]|[A-Z])+\:/g, "")
.replaceAll(/(\?|\!|\.|\,|\;)/g, "")