Proper UTF-8 support for input words & better "cutting" using syllable cuts
This commit is contained in:
parent
3d88aa6b62
commit
2827ce3e82
15
autofeur_db/Cargo.lock
generated
15
autofeur_db/Cargo.lock
generated
|
@ -60,16 +60,19 @@ dependencies = [
|
||||||
"hypher",
|
"hypher",
|
||||||
"itertools",
|
"itertools",
|
||||||
"kdam",
|
"kdam",
|
||||||
|
"levenshtein",
|
||||||
"querystring",
|
"querystring",
|
||||||
"rand",
|
"rand",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"strsim",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tower",
|
"tower",
|
||||||
"tower-http",
|
"tower-http",
|
||||||
"trie-rs",
|
"trie-rs",
|
||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
|
"url",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -650,6 +653,12 @@ version = "1.4.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "levenshtein"
|
||||||
|
version = "1.0.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.139"
|
version = "0.2.139"
|
||||||
|
@ -1239,6 +1248,12 @@ dependencies = [
|
||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strsim"
|
||||||
|
version = "0.11.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "syn"
|
name = "syn"
|
||||||
version = "1.0.107"
|
version = "1.0.107"
|
||||||
|
|
|
@ -24,6 +24,9 @@ anyhow = "1.0.68"
|
||||||
itertools = "0.10.5"
|
itertools = "0.10.5"
|
||||||
querystring = "1.1.0"
|
querystring = "1.1.0"
|
||||||
hypher = { version = "0.1", features = ["english", "french"] }
|
hypher = { version = "0.1", features = ["english", "french"] }
|
||||||
|
levenshtein = "1.0.5"
|
||||||
|
strsim = "0.11.1"
|
||||||
|
url = "*"
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
name = "generate"
|
name = "generate"
|
||||||
|
|
|
@ -6,16 +6,7 @@ use std::collections::HashMap;
|
||||||
use std::{fs, net::SocketAddr, sync::Arc};
|
use std::{fs, net::SocketAddr, sync::Arc};
|
||||||
use tower::{make::Shared, ServiceBuilder};
|
use tower::{make::Shared, ServiceBuilder};
|
||||||
use tower_http::add_extension::AddExtensionLayer;
|
use tower_http::add_extension::AddExtensionLayer;
|
||||||
|
use url::form_urlencoded::parse;
|
||||||
fn parse_query(query: &str) -> HashMap<String, String> {
|
|
||||||
query
|
|
||||||
.split('&')
|
|
||||||
.filter_map(|s| {
|
|
||||||
s.split_once('=')
|
|
||||||
.and_then(|t| Some((t.0.to_owned(), t.1.to_owned())))
|
|
||||||
})
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn anyhow_response(err: anyhow::Error) -> Response<Body> {
|
fn anyhow_response(err: anyhow::Error) -> Response<Body> {
|
||||||
Response::builder()
|
Response::builder()
|
||||||
|
@ -34,7 +25,18 @@ async fn handler(request: Request<Body>) -> Result<Response<Body>, hyper::Error>
|
||||||
Ok(ok) => ok,
|
Ok(ok) => ok,
|
||||||
Err(err) => return Ok(err),
|
Err(err) => return Ok(err),
|
||||||
};
|
};
|
||||||
let data = match parse_query(query)
|
|
||||||
|
let params: HashMap<String, String> = request
|
||||||
|
.uri()
|
||||||
|
.query()
|
||||||
|
.map(|v| {
|
||||||
|
url::form_urlencoded::parse(v.as_bytes())
|
||||||
|
.into_owned()
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.unwrap_or_else(HashMap::new);
|
||||||
|
|
||||||
|
let data = match params
|
||||||
.get("grapheme")
|
.get("grapheme")
|
||||||
.ok_or_else(|| anyhow_response(anyhow!("grapheme argument is not specified")))
|
.ok_or_else(|| anyhow_response(anyhow!("grapheme argument is not specified")))
|
||||||
{
|
{
|
||||||
|
|
|
@ -20,6 +20,10 @@ async fn call_inference_service(word: &str) -> anyhow::Result<String> {
|
||||||
impl Save<'_> {
|
impl Save<'_> {
|
||||||
pub async fn inference(&self, prefix: &str) -> anyhow::Result<String> {
|
pub async fn inference(&self, prefix: &str) -> anyhow::Result<String> {
|
||||||
let phonemes = call_inference_service(prefix).await?;
|
let phonemes = call_inference_service(prefix).await?;
|
||||||
|
let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French)
|
||||||
|
.into_iter()
|
||||||
|
.collect_vec();
|
||||||
|
println!("syl: [{}]", source_word_syllabes.join(","));
|
||||||
|
|
||||||
let completion = self
|
let completion = self
|
||||||
.trie
|
.trie
|
||||||
|
@ -37,8 +41,9 @@ impl Save<'_> {
|
||||||
|
|
||||||
println!("Matching {} by adding {}", word, completion);
|
println!("Matching {} by adding {}", word, completion);
|
||||||
|
|
||||||
let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French).into_iter().collect_vec();
|
let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French)
|
||||||
let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French).into_iter().collect_vec();
|
.into_iter()
|
||||||
|
.collect_vec();
|
||||||
|
|
||||||
// input: test
|
// input: test
|
||||||
// output found: testames
|
// output found: testames
|
||||||
|
@ -46,18 +51,33 @@ impl Save<'_> {
|
||||||
// output expect: tames
|
// output expect: tames
|
||||||
// we just need to remove the prefix
|
// we just need to remove the prefix
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"[{}] cmp [{}]",
|
||||||
|
source_word_syllabes.join(","),
|
||||||
|
completed_syllabes.join(",")
|
||||||
|
);
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
for (index, syl) in source_word_syllabes.iter().enumerate() {
|
let maxindex = source_word_syllabes.len() - 1;
|
||||||
if *source_word_syllabes[index] == **syl {
|
for (index, syl) in completed_syllabes.iter().enumerate() {
|
||||||
|
if maxindex < index {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
let phon1 = &source_word_syllabes[index].to_lowercase();
|
||||||
|
let phon2 = &(**syl).to_lowercase();
|
||||||
|
|
||||||
|
println!("comparing syllab {} vs {}", phon1, phon2);
|
||||||
|
|
||||||
|
if strsim::levenshtein(phon1, phon2) < 2 {
|
||||||
i = index
|
i = index
|
||||||
} else {
|
} else {
|
||||||
|
println!("found scyl break at {}", i);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
completed_syllabes.drain(0..i); // we finally just need to compute the end of the word which matches the sound
|
// we finally just need to compute the end of the word which matches the sound
|
||||||
let found = completed_syllabes.join("");
|
let found = completed_syllabes.drain(i+1..).join("");
|
||||||
println!("{} is equivalent to {}", completion, found);
|
println!("{} is equivalent to {}", completion, found);
|
||||||
|
|
||||||
Ok(format!("{} ({})", found, word))
|
Ok(format!("{} ({})", found, word))
|
||||||
|
|
|
@ -43,8 +43,6 @@ const sanitizeWord = (sentence) => {
|
||||||
.trim()
|
.trim()
|
||||||
.split(" ")
|
.split(" ")
|
||||||
.slice(-1)[0]
|
.slice(-1)[0]
|
||||||
.normalize('NFKD')
|
|
||||||
.replace(/[\u0300-\u036f]/g, "")
|
|
||||||
.replaceAll(/(?:https?|ftp):\/\/[\n\S]+/g, "")
|
.replaceAll(/(?:https?|ftp):\/\/[\n\S]+/g, "")
|
||||||
.replaceAll(/\:([a-z]|[A-Z])+\:/g, "")
|
.replaceAll(/\:([a-z]|[A-Z])+\:/g, "")
|
||||||
.replaceAll(/(\?|\!|\.|\,|\;)/g, "")
|
.replaceAll(/(\?|\!|\.|\,|\;)/g, "")
|
||||||
|
|
Loading…
Reference in a new issue