proper UTF-8 support for input words & better "cutting" using syllable cuts
parent 3d88aa6b62
commit 2827ce3e82
autofeur_db/Cargo.lock (generated) | 15
@@ -60,16 +60,19 @@ dependencies = [
  "hypher",
  "itertools",
  "kdam",
+ "levenshtein",
  "querystring",
  "rand",
  "reqwest",
  "serde",
  "serde_json",
+ "strsim",
  "tokio",
  "tower",
  "tower-http",
  "trie-rs",
  "unicode-segmentation",
+ "url",
 ]
 
 [[package]]

@@ -650,6 +653,12 @@ version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
 
+[[package]]
+name = "levenshtein"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
+
 [[package]]
 name = "libc"
 version = "0.2.139"

@@ -1239,6 +1248,12 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
 [[package]]
 name = "syn"
 version = "1.0.107"
@@ -24,6 +24,9 @@ anyhow = "1.0.68"
 itertools = "0.10.5"
 querystring = "1.1.0"
 hypher = { version = "0.1", features = ["english", "french"] }
+levenshtein = "1.0.5"
+strsim = "0.11.1"
+url = "*"
 
 [[bin]]
 name = "generate"
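The three manifest additions line up with the code changes further down: levenshtein and strsim provide edit-distance checks for comparing syllables, and url provides percent-decoding for query strings; hypher (already listed) is what produces the syllables. A minimal sketch of what hyphenate hands to the cutting logic — the word is arbitrary and the exact split is whatever hypher's French patterns produce:

use hypher::{hyphenate, Lang};

fn main() {
    // Mirror how the inference code builds `source_word_syllabes` from the prefix.
    let syllables: Vec<&str> = hyphenate("ordinateur", Lang::French).into_iter().collect();
    println!("[{}]", syllables.join(","));
}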
@@ -6,16 +6,7 @@ use std::collections::HashMap;
 use std::{fs, net::SocketAddr, sync::Arc};
 use tower::{make::Shared, ServiceBuilder};
 use tower_http::add_extension::AddExtensionLayer;
-
-fn parse_query(query: &str) -> HashMap<String, String> {
-    query
-        .split('&')
-        .filter_map(|s| {
-            s.split_once('=')
-                .and_then(|t| Some((t.0.to_owned(), t.1.to_owned())))
-        })
-        .collect()
-}
+use url::form_urlencoded::parse;
 
 fn anyhow_response(err: anyhow::Error) -> Response<Body> {
     Response::builder()

@@ -34,7 +25,18 @@ async fn handler(request: Request<Body>) -> Result<Response<Body>, hyper::Error>
         Ok(ok) => ok,
         Err(err) => return Ok(err),
     };
-    let data = match parse_query(query)
+
+    let params: HashMap<String, String> = request
+        .uri()
+        .query()
+        .map(|v| {
+            url::form_urlencoded::parse(v.as_bytes())
+                .into_owned()
+                .collect()
+        })
+        .unwrap_or_else(HashMap::new);
+
+    let data = match params
         .get("grapheme")
         .ok_or_else(|| anyhow_response(anyhow!("grapheme argument is not specified")))
     {
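The handler now decodes the query with url::form_urlencoded::parse instead of the removed parse_query, which split on '&' and '=' without percent-decoding, so accented (UTF-8) input words reached the backend still percent-encoded. A minimal sketch of the difference; the query string below is made up for illustration:

use std::collections::HashMap;

fn main() {
    // A percent-encoded UTF-8 query value, as a browser would send it.
    let query = "grapheme=f%C3%A9e";

    // Naive split (old behaviour): the value stays percent-encoded.
    let naive: HashMap<String, String> = query
        .split('&')
        .filter_map(|s| s.split_once('='))
        .map(|(k, v)| (k.to_owned(), v.to_owned()))
        .collect();
    assert_eq!(naive["grapheme"], "f%C3%A9e");

    // url::form_urlencoded (new behaviour): decoded into proper UTF-8.
    let decoded: HashMap<String, String> = url::form_urlencoded::parse(query.as_bytes())
        .into_owned()
        .collect();
    assert_eq!(decoded["grapheme"], "fée");
}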
@@ -20,6 +20,10 @@ async fn call_inference_service(word: &str) -> anyhow::Result<String> {
 impl Save<'_> {
     pub async fn inference(&self, prefix: &str) -> anyhow::Result<String> {
         let phonemes = call_inference_service(prefix).await?;
+        let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French)
+            .into_iter()
+            .collect_vec();
+        println!("syl: [{}]", source_word_syllabes.join(","));
 
         let completion = self
             .trie

@@ -37,8 +41,9 @@ impl Save<'_> {
 
         println!("Matching {} by adding {}", word, completion);
 
-        let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French).into_iter().collect_vec();
-        let source_word_syllabes: Vec<&str> = hyphenate(prefix, hypher::Lang::French).into_iter().collect_vec();
+        let mut completed_syllabes: Vec<&str> = hyphenate(word, hypher::Lang::French)
+            .into_iter()
+            .collect_vec();
 
         // input: test
         // output found: testames

@@ -46,18 +51,33 @@ impl Save<'_> {
         // output expect: tames
         // we just need to remove the prefix
 
-
+        println!(
+            "[{}] cmp [{}]",
+            source_word_syllabes.join(","),
+            completed_syllabes.join(",")
+        );
         let mut i = 0;
-        for (index, syl) in source_word_syllabes.iter().enumerate() {
-            if *source_word_syllabes[index] == **syl {
+        let maxindex = source_word_syllabes.len() - 1;
+        for (index, syl) in completed_syllabes.iter().enumerate() {
+            if maxindex < index {
+                break;
+            }
+
+            let phon1 = &source_word_syllabes[index].to_lowercase();
+            let phon2 = &(**syl).to_lowercase();
+
+            println!("comparing syllab {} vs {}", phon1, phon2);
+
+            if strsim::levenshtein(phon1, phon2) < 2 {
                 i = index
             } else {
+                println!("found scyl break at {}", i);
                 break;
             }
         }
 
-        completed_syllabes.drain(0..i); // we finally just need to compute the end of the word which matches the sound
-        let found = completed_syllabes.join("");
+        // we finally just need to compute the end of the word which matches the sound
+        let found = completed_syllabes.drain(i+1..).join("");
         println!("{} is equivalent to {}", completion, found);
 
         Ok(format!("{} ({})", found, word))
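The new cut walks the completion's syllables next to the prefix's syllables, keeps advancing while each lowercased pair is within a Levenshtein distance of 2, and then keeps only what comes after the last matching syllable. A minimal, self-contained sketch of that idea outside the Save type — the helper name and the example words are illustrative, not taken from the real trie:

use hypher::{hyphenate, Lang};
use strsim::levenshtein;

// Keep only the part of `completion` that comes after the syllables already
// "covered" by `prefix` (approximate match: edit distance < 2 per syllable).
fn cut_after_prefix(prefix: &str, completion: &str) -> String {
    let prefix_syl: Vec<&str> = hyphenate(prefix, Lang::French).into_iter().collect();
    let mut completion_syl: Vec<&str> = hyphenate(completion, Lang::French).into_iter().collect();

    let mut last_match = 0;
    for (index, syl) in completion_syl.iter().enumerate() {
        // Stop once there is no prefix syllable left to compare against.
        if index >= prefix_syl.len() {
            break;
        }
        if levenshtein(&prefix_syl[index].to_lowercase(), &syl.to_lowercase()) < 2 {
            last_match = index;
        } else {
            break;
        }
    }

    // Drain everything after the last matching syllable; that tail is the suffix
    // appended to the prefix (clamped so an empty completion cannot panic).
    let cut = (last_match + 1).min(completion_syl.len());
    completion_syl.drain(cut..).collect::<Vec<_>>().join("")
}

fn main() {
    // Hypothetical call: real completions come out of the trie lookup.
    println!("{}", cut_after_prefix("test", "testament"));
}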
@@ -43,8 +43,6 @@ const sanitizeWord = (sentence) => {
     .trim()
     .split(" ")
     .slice(-1)[0]
-    .normalize('NFKD')
-    .replace(/[\u0300-\u036f]/g, "")
     .replaceAll(/(?:https?|ftp):\/\/[\n\S]+/g, "")
     .replaceAll(/\:([a-z]|[A-Z])+\:/g, "")
     .replaceAll(/(\?|\!|\.|\,|\;)/g, "")