autofeur v3 - custom db
This commit is contained in:
parent a9077c56fb
commit 0865b57a15

73 README.md
@@ -1,3 +1,74 @@
# Autofeur

Bot that responds with the end of words.

## Structure

Autofeur is composed of a few components that make up this bot:

|Name|Description|
|-|-|
|autofeur_nova|The component handling Discord events. It uses the [nova framework](https://github.com/discordnova/nova) under the hood and is developed in TypeScript.|
|deep_phonemizer|The component that transforms a grapheme into a phoneme using [DeepPhonemizer](https://github.com/as-ideas/DeepPhonemizer).|
|autofeur_db|The component used for completing the end of words; it is a DB specialized in this specific task.|

## Running Autofeur

### Getting trained models

You'll need two files to get running with Autofeur: a trained `DeepPhonemizer` model and an IPA dictionary file.

You can get the `DeepPhonemizer` model on the project's [GitHub page](https://github.com/as-ideas/DeepPhonemizer#pretrained-models) or follow the instructions there to create your own datasets.

You can get the IPA dictionary on this [GitHub page](https://github.com/open-dict-data/ipa-dict) or use your own; it's simply a CSV file with two columns, one for the word and another for the phonemized word.

### Starting `deep_phonemizer`

To run it inside Docker, we recommend:

`docker-compose up deep-phonemizer`

If you want to run on bare metal, use the following commands.

You'll need to move your trained model to the `deep_phonemizer/assets/model.pt` file first.

```sh
# Go into the folder
cd deep_phonemizer

# Create a virtual environment with dependencies
python3 -m venv ./venv
source ./venv/bin/activate
pip install -r requirements.txt

# Run the Flask application
flask run
```

### Starting `autofeur_db`

#### Generating the database

The autofeur DB needs to be pre-computed in order to deliver excellent performance.
First of all, you need to have your dictionary file at `autofeur_db/assets/dictionary.csv`.
Only then can you start generating the DB.

```sh
cd autofeur_db
cargo run --release --bin generate
```

This will output an `autofeur_db/assets/db.bin` which will be used by the DB to complete words.

### Starting the service

To start `autofeur_db` you can simply use the docker container (`docker-compose up autofeur_db`)
or use the bare-metal commands:

```sh
cd autofeur_db
cargo run --release --bin server
```

### Starting the nova components

You'll need nova to use this bot; however, setup is quite easy and only requires a configuration file,
which you can find on the [project's GitHub](https://github.com/discordnova/nova), or use this project's example config file located at `autofeur_nova/config/default.example.yml`.
Your config file will need to be named `autofeur_nova/config/default.yml`.

To start nova, you can either use the `all-in-one` binary or the full-blown docker-compose services.
To get started with the all-in-one binary, simply execute `yarn nova` in the `autofeur_nova/` folder, or execute `docker-compose up nats redis cache gateway0 rest ratelimiter webhook` to start all nova components.

### Starting `autofeur_nova`

This component requires basically no configuration: it is configured through environment variables in Docker, and the defaults work using localhost. You can refer to the component's README for the available configuration environment variables. Start it with `yarn start` or `docker-compose up autofeur_nova`.
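The completion idea behind `autofeur_db` can be sketched in a few lines (a minimal Python illustration of the concept only, not the actual Rust implementation; the example words and phonemes are invented for the sketch):

```python
# Sketch of the autofeur_db idea: index phonemized words, then complete the
# phonemes of an incoming word with the ending of a known word.
def build_index(entries):
    """entries: list of (word, phoneme) pairs, as in the dictionary CSV."""
    reverse_index = {phoneme: word for word, phoneme in entries}
    phonemes = [phoneme for _, phoneme in entries]
    return reverse_index, phonemes

def complete(prefix_phoneme, reverse_index, phonemes):
    """Find a known phoneme starting with the prefix; return the phoneme
    suffix and the full word it belongs to, or None when nothing matches."""
    for candidate in phonemes:
        if candidate.startswith(prefix_phoneme) and candidate != prefix_phoneme:
            return candidate[len(prefix_phoneme):], reverse_index[candidate]
    return None

reverse_index, phonemes = build_index([("quoicoubeh", "kwakube"), ("quoi", "kwa")])
print(complete("kwa", reverse_index, phonemes))
```

The real DB replaces the linear scan with a phoneme trie and picks the completion randomly, weighted by how many words share each branch.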
2 autofeur_db/.dockerignore Normal file

@@ -0,0 +1,2 @@
target/
assets/
2 autofeur_db/.gitignore vendored Normal file

@@ -0,0 +1,2 @@
target/
assets/
1692 autofeur_db/Cargo.lock generated Normal file
File diff suppressed because it is too large. Load diff
32 autofeur_db/Cargo.toml Normal file

@@ -0,0 +1,32 @@
[package]
name = "autofeur"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
trie-rs = "0.1.1"
csv = "1.1.6"
unicode-segmentation = "1.10.0"
rand = "0.8.5"
serde = { version = "1.0.152", features = ["derive"] }
serde_json = "1.0.91"
bincode = "1.3.3"

tower = { version = "0.4", features = ["full"] }
tower-http = { version = "0.3.5", features = ["full"] }
hyper = { version = "0.14.23", features = ["server"] }
tokio = { version = "1.24.2", features = ["full"] }
reqwest = "0.11.14"
kdam = { version = "0.3", features = ["gradient", "template"] }
anyhow = "1.0.68"
itertools = "0.10.5"
querystring = "1.1.0"
levenshtein = "1.0.5"

[[bin]]
name = "generate"

[[bin]]
name = "server"
21 autofeur_db/Dockerfile Normal file

@@ -0,0 +1,21 @@
FROM lukemathwalker/cargo-chef:latest-rust-1 AS chef
WORKDIR /app

FROM chef AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder
COPY --from=planner /app/recipe.json recipe.json
# Build dependencies - this is the caching Docker layer!
RUN cargo chef cook --release --recipe-path recipe.json
# Build application
COPY . .
RUN cargo build --release --bin server

# We do not need the Rust toolchain to run the binary!
FROM debian:buster-slim AS runtime
WORKDIR /app
RUN apt-get update && apt-get install -y ca-certificates libssl-dev
COPY --from=builder /app/target/release/server /usr/local/bin/server
ENTRYPOINT ["/usr/local/bin/server"]
60 autofeur_db/src/bin/generate.rs Normal file

@@ -0,0 +1,60 @@
use std::fs;

use autofeur::french_ipa::parse_word;
use autofeur::save::Save;
use kdam::tqdm;

#[tokio::main]
/// Generates the DB file for easy usage.
async fn main() {
    let mut save = Save::default();

    // Read the dictionary from the assets folder
    let mut vocabulary = csv::Reader::from_path("./assets/dictionary.csv").unwrap();
    let mut phonems = vec![];

    // Reduce all the records into the save index;
    // this is used to get all the phonemes represented in the csv
    for record in tqdm!(
        vocabulary.records(),
        total = 245178,
        colour = "gradient(#5A56E0,#EE6FF8)"
    ) {
        let record = record.unwrap();
        let word = record.get(0).unwrap().to_string();
        let mut pron: Vec<String> = record
            .get(1)
            .unwrap()
            .split(',')
            .map(|a| {
                a.to_string()
                    .trim()
                    .replace("/", "")
                    .replace("ʼ", "")
                    .replace("ː", "")
                    .replace(" ", "")
                    .replace(".", "")
            })
            .collect();
        for a in &pron {
            save.reverse_index.insert(a.clone(), word.clone());
        }
        phonems.append(&mut pron);
    }

    let mut invalid = 0;
    for phoneme in tqdm!(phonems.iter()) {
        match parse_word(&phoneme) {
            Some(a) => save.trie.insert(a),
            None => {
                invalid += 1;
            }
        }
    }

    println!("Invalid items count: {}", invalid);

    fs::write("assets/db.bin", bincode::serialize(&save).unwrap()).unwrap();

    println!("Generated to assets/db.bin");
}
55 autofeur_db/src/bin/server.rs Normal file

@@ -0,0 +1,55 @@
use anyhow::anyhow;
use autofeur::save::Save;
use hyper::http::{Request, Response};
use hyper::{server::Server, Body};
use std::collections::HashMap;
use std::{fs, net::SocketAddr, sync::Arc};
use tower::{make::Shared, ServiceBuilder};
use tower_http::add_extension::AddExtensionLayer;

fn parse_query(query: &str) -> HashMap<String, String> {
    query
        .split('&')
        .filter_map(|s| {
            s.split_once('=')
                .and_then(|t| Some((t.0.to_owned(), t.1.to_owned())))
        })
        .collect()
}

async fn handler(request: Request<Body>) -> Result<Response<Body>, anyhow::Error> {
    let save: &Arc<Save> = request.extensions().get().unwrap();
    let query = request
        .uri()
        .query()
        .ok_or_else(|| anyhow!("query does not exist"))?;
    let data = parse_query(query)
        .get("grapheme")
        .ok_or_else(|| anyhow!("grapheme argument is not specified"))?
        .clone();

    let infered = save
        .inference(&data)
        .await
        .or_else(|_| Err(anyhow!("cannot find data")))?;

    Ok(Response::builder().body(Body::from(infered)).unwrap())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let checkpoint: Save = bincode::deserialize(&fs::read("assets/db.bin").unwrap()).unwrap();
    let service = ServiceBuilder::new()
        .layer(AddExtensionLayer::new(Arc::new(checkpoint)))
        // Wrap a `Service` in our middleware stack
        .service_fn(handler);

    // And run our service using `hyper`
    let addr = SocketAddr::from(([0, 0, 0, 0], 3000));
    Server::bind(&addr)
        .http1_only(true)
        .serve(Shared::new(service))
        .await
        .expect("server error");
    Ok(())
}
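The server's `parse_query` splits the raw query string on `&`, then on the first `=`, dropping pairs without an `=`. The same behaviour can be mirrored in a few lines of Python to make the contract explicit (a sketch for illustration, not part of the codebase):

```python
# Mirror of the server's parse_query: split on '&', then on the first '=';
# parts without '=' are dropped, and later duplicate keys win.
def parse_query(query: str) -> dict:
    result = {}
    for part in query.split("&"):
        if "=" in part:
            key, _, value = part.partition("=")
            result[key] = value
    return result

print(parse_query("grapheme=quoi&x=1"))  # {'grapheme': 'quoi', 'x': '1'}
```

Note that, like the Rust version, only the first `=` is significant, so values may themselves contain `=`.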
129 autofeur_db/src/french_ipa.rs Normal file

@@ -0,0 +1,129 @@
use std::hash::Hash;

use unicode_segmentation::UnicodeSegmentation;

macro_rules! ipa_element_to_number {
    (@step $_idx:expr, $ident:ident,) => {
        None
    };

    (@step $idx:expr, $ident:ident, $head:literal, $($tail:literal,)*) => {
        if $ident == $head {
            Some(Self($idx))
        } else {
            ipa_element_to_number!(@step $idx + 1usize, $ident, $($tail,)*)
        }
    };
}

macro_rules! ipa_number_to_ipa {
    (@step $_idx:expr, $ident:ident,) => {
        "unreachable!()"
    };

    (@step $idx:expr, $ident:ident, $head:literal, $($tail:literal,)*) => {
        if $ident == $idx {
            $head
        } else {
            ipa_number_to_ipa!(@step $idx + 1usize, $ident, $($tail,)*)
        }
    };
}

macro_rules! replace_expr {
    ($_t:tt $sub:expr) => {
        $sub
    };
}

macro_rules! count_tts {
    ($($tts:tt)*) => {0usize $(+ replace_expr!($tts 1usize))*};
}

macro_rules! ipa_map {
    ($name:ident, $($l:literal),*) => {
        use serde::{Deserialize, Serialize};
        #[derive(Eq, Hash, PartialEq, Debug, Copy, Clone, Serialize, Deserialize)]
        pub struct $name(pub usize);

        impl $name {
            pub const SIZE: usize = count_tts!($($l,)*);

            pub fn from_char(ch: &str) -> Option<$name> {
                ipa_element_to_number!(@step 0usize, ch, $($l,)*)
            }
            pub fn to_char(self) -> &'static str {
                let num = self.0;
                ipa_number_to_ipa!(@step 0usize, num, $($l,)*)
            }
        }
    };
}

ipa_map!(
    FrenchIPAChar,
    "a", "ɑ", "ɑ̃", "e", "ɛ", "ɛ̃", "ə", "i", "o", "ɔ", "ɔ̃", "œ", "œ̃", "ø",
    "u", "y", "j", "ɥ", "w", "b", "d", "f", "g", "k", "l", "m", "n", "ɲ", "ŋ",
    "p", "ʁ", "s", "ʃ", "t", "v", "z", "ʒ", "g", "ɡ", "ɪ", "ʊ", "x", "r"
);

pub type FrenchIPAWord = Vec<FrenchIPAChar>;

pub fn parse_word(str: &str) -> Option<FrenchIPAWord> {
    let mut word = FrenchIPAWord::default();
    let graphemes: Vec<&str> = str.graphemes(true).collect();
    for (_, grapheme) in graphemes.iter().enumerate() {
        let a = FrenchIPAChar::from_char(grapheme);

        word.push(match a {
            None => {
                println!("invalid char: {}", grapheme);
                return None;
            }
            Some(a) => a,
        })
    }

    Some(word)
}
61 autofeur_db/src/inference.rs Normal file

@@ -0,0 +1,61 @@
use std::{collections::VecDeque, env, ops::Add};

use anyhow::anyhow;
use itertools::Itertools;
use levenshtein::levenshtein;
use unicode_segmentation::UnicodeSegmentation;

use crate::{french_ipa::parse_word, save::Save};

async fn call_inference_service(word: &str) -> anyhow::Result<String> {
    let server: Result<String, anyhow::Error> =
        env::var("PHONEMIZER").or_else(|_| Ok("".to_string()));
    Ok(
        reqwest::get(format!("{}?grapheme={}", server.unwrap(), word))
            .await?
            .text()
            .await?,
    )
}

impl Save {
    pub async fn inference(&self, prefix: &str) -> anyhow::Result<String> {
        let phonemes = call_inference_service(prefix).await?;
        let ipa_phonemes =
            parse_word(&phonemes).ok_or_else(|| anyhow!("failed to parse the word"))?;

        let completion = self
            .trie
            .random_starting_with(ipa_phonemes)
            .ok_or_else(|| anyhow!("no matches"))?;

        let infered = phonemes.add(&completion);
        let word = self
            .reverse_index
            .get(&infered)
            .ok_or_else(|| anyhow!("matched value is not in dictionary"))?;

        println!("Matching {} by adding {}", word, completion);

        // we finally just need to compute the end of the word which matches the sound
        let mut found = None;

        let mut characters: VecDeque<&str> = word.graphemes(true).collect();
        while let Some(_) = characters.pop_front() {
            let sub: String = characters.iter().join("");
            let inference = call_inference_service(&sub).await?;

            if levenshtein(&inference, &completion) < 2 {
                found = Some(sub);
                break;
            } else {
                println!("did not match a={}, b={}", inference, completion)
            }
        }

        let found = found.ok_or_else(|| anyhow!("no prefix could be matched"))?;
        println!("{} is equivalent to {}", completion, found);

        Ok(format!("{} ({})", found, word))
    }
}
4 autofeur_db/src/lib.rs Normal file

@@ -0,0 +1,4 @@
pub mod trie;
pub mod french_ipa;
pub mod save;
pub mod inference;
11 autofeur_db/src/save.rs Normal file

@@ -0,0 +1,11 @@
use std::collections::HashMap;

use serde::{Deserialize, Serialize};

use crate::trie::Trie;

#[derive(Debug, Deserialize, Serialize, Default)]
pub struct Save {
    pub trie: Trie,
    pub reverse_index: HashMap<String, String>,
}
169 autofeur_db/src/trie.rs Normal file

@@ -0,0 +1,169 @@
use std::collections::HashMap;

use rand::{thread_rng, Rng};
use serde::{Deserialize, Serialize};

use crate::french_ipa::{FrenchIPAChar, FrenchIPAWord};

#[derive(Debug, Serialize, Deserialize, Default)]
pub struct TrieNode {
    value: Option<FrenchIPAChar>,
    is_final: bool,
    child_nodes: HashMap<FrenchIPAChar, TrieNode>,
    child_count: u64,
}

impl TrieNode {
    // Create a new node
    pub fn new(c: FrenchIPAChar, is_final: bool) -> TrieNode {
        TrieNode {
            value: Option::Some(c),
            is_final,
            child_nodes: HashMap::with_capacity(FrenchIPAChar::SIZE),
            child_count: 0,
        }
    }

    pub fn new_root() -> TrieNode {
        TrieNode {
            value: Option::None,
            is_final: false,
            child_nodes: HashMap::with_capacity(FrenchIPAChar::SIZE),
            child_count: 0,
        }
    }
}

#[derive(Debug, Serialize, Deserialize, Default)]
pub struct Trie {
    root_node: Box<TrieNode>,
}

impl Trie {
    // Create a TrieStruct
    pub fn new() -> Trie {
        Trie {
            root_node: Box::new(TrieNode::new_root()),
        }
    }

    // Insert a string
    pub fn insert(&mut self, char_list: FrenchIPAWord) {
        let mut current_node: &mut TrieNode = self.root_node.as_mut();
        let mut last_match = 0;

        // Find the longest existing match
        for letter_counter in 0..char_list.len() {
            if current_node
                .child_nodes
                .contains_key(&char_list[letter_counter])
            {
                current_node = current_node
                    .child_nodes
                    .get_mut(&char_list[letter_counter])
                    .unwrap();
                // we mark the node as containing our children.
                current_node.child_count += 1;
            } else {
                last_match = letter_counter;
                break;
            }
            last_match = letter_counter + 1;
        }

        // if we found an already existing node
        if last_match == char_list.len() {
            current_node.is_final = true;
        } else {
            for new_counter in last_match..char_list.len() {
                let key = char_list[new_counter];
                current_node
                    .child_nodes
                    .insert(key, TrieNode::new(char_list[new_counter], false));
                current_node = current_node.child_nodes.get_mut(&key).unwrap();
                current_node.child_count += 1;
            }
            current_node.is_final = true;
        }
    }

    // Find a random word starting with the given prefix
    pub fn random_starting_with(&self, prefix: FrenchIPAWord) -> Option<String> {
        let mut current_node: &TrieNode = self.root_node.as_ref();
        let mut str = String::new();
        let mut i = prefix.len();
        // Descend as far as possible into the tree
        for counter in prefix {
            if let Some(node) = current_node.child_nodes.get(&counter) {
                current_node = node;
                if let Some(value) = current_node.value {
                    str += value.to_char();
                    i -= 1;
                }
            } else {
                // couldn't descend fully into the tree
                return None;
            }
        }

        println!("Found common root node {}", str);

        // Ignore the 0-len matches
        if i == 0 && current_node.child_nodes.len() == 0 {
            println!("removing 0-len match");
            return None;
        }
        str = String::new();

        // now that we have the node we descend by respecting the probabilities
        while current_node.child_nodes.len() != 0 && current_node.child_count > 0 {
            println!("Descending into node {}", str);
            let max = current_node.child_count;
            let random_number = thread_rng().gen_range(0..max);
            let mut increment = 0;

            let mut did_change = false;
            // find the child node corresponding to the random number
            for (_, node) in &current_node.child_nodes {
                if node.child_count + increment >= random_number {
                    println!("changing node");
                    current_node = node;
                    did_change = true;
                    break;
                } else {
                    println!(
                        "didn't change node: {}<{}",
                        node.child_count + increment,
                        random_number
                    )
                }
                increment += node.child_count;
            }
            if did_change {
                if let Some(value) = current_node.value {
                    println!("added {}", value.to_char());
                    str += value.to_char();
                }
            } else {
                println!(
                    "WARNING: DIDN'T CHANGE NODE child_count={}",
                    current_node.child_count
                )
            }
            // if this node is a final node, we have a probability of using it
            if current_node.is_final && current_node.child_count > 0 {
                let random_number = thread_rng().gen_range(0..current_node.child_count);
                if random_number == 0 {
                    break;
                }
            }
        }

        if str == "" {
            return None;
        }

        // selected word
        Some(str)
    }
}
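The descent in `random_starting_with` picks a child with probability proportional to its `child_count`, so endings shared by many words are drawn more often. The standard weighted-choice technique it relies on can be sketched in Python (an illustration of the technique, not a line-for-line port of the Rust loop):

```python
import random

# Pick a key with probability proportional to its count: draw a number in
# [0, total), then walk the entries accumulating counts until it is covered.
def weighted_pick(children: dict, rng=random):
    total = sum(children.values())
    r = rng.randrange(total)
    acc = 0
    for key, count in children.items():
        acc += count
        if r < acc:
            return key
    return key  # unreachable when total > 0
```

With counts `{"a": 1, "b": 9}`, `"b"` is chosen roughly nine times out of ten, which is exactly how frequent branches of the phoneme trie dominate the completions.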
3 autofeur_nova/.dockerignore Normal file

@@ -0,0 +1,3 @@
config/
dist/
node_modules/
3 autofeur_nova/.gitignore vendored

@@ -1,4 +1,5 @@
bin/
node_modules/
dist/
config/
config/*
!config/default.example.yml
@@ -2,28 +2,18 @@ FROM node as builder

# Create app directory
WORKDIR /usr/src/app

# Install app dependencies
COPY package*.json ./

RUN npm i

COPY . .

RUN npm run build
RUN npm run build || true

FROM node:slim

# Create app directory
WORKDIR /usr/src/app

# Install app dependencies
COPY package*.json ./
COPY data.json .

RUN npm i --omit=dev --production

COPY --from=builder /usr/src/app/dist ./dist

EXPOSE 8080
CMD [ "node", "dist/index.mjs" ]
59 autofeur_nova/config/default.example.yml Normal file

@@ -0,0 +1,59 @@
gateway:
  token: # You need to fill this!
  intents: 3276799
  shard: 0
  shard_total: 1

rest:
  discord:
    token: # You need to fill this!
  server:
    listening_adress: 0.0.0.0:8090
  ratelimiter_address: localhost # You need to change this to your ratelimit server address!
  ratelimiter_port: 8092

webhook:
  discord:
    public_key: # You need to fill this!
  server:
    listening_adress: 0.0.0.0:8091

cache:
  toggles:
    - channels_cache
    - guilds_cache
    - guild_schedules_cache
    - stage_instances_cache
    - integrations_cache
    - members_cache
    - bans_cache
    - reactions_cache
    - messages_cache
    - threads_cache
    - invites_cache
    - roles_cache
    - automoderation_cache
    - voice_states_cache

ratelimiter:
  server:
    listening_adress: 0.0.0.0:8092

# Prometheus monitoring configuration
monitoring:
  enabled: false
  address: 0.0.0.0
  port: 9000

# Nats broker configuration
nats:
  host: nats

redis:
  url: redis://redis

#opentelemetry:
#  metrics:
#    endpoint: http://otelcol:4317
#  traces:
#    endpoint: http://otelcol:4317
File diff suppressed because one or more lines are too long

@@ -8,8 +8,8 @@
  "type": "module",
  "license": "Apache-2.0",
  "dependencies": {
    "@discordnova/nova-cli": "0.0.4",
    "@discordnova/nova-js": "^0.0.4",
    "@discordnova/nova-cli": "0.0.5",
    "@discordnova/nova-js": "0.0.5",
    "source-map-support": "^0.5.21",
    "tslib": "^2.4.1",
    "undici": "^5.15.0"

@@ -17,7 +17,7 @@
  "devDependencies": {
    "@types/node": "^18.11.18",
    "discord-api-types": "^0.37.25",
    "typescript": "^4.9.4",
    "typescript": "^5.0.0-dev.20230120",
    "xo": "^0.53.1"
  },
  "scripts": {
@@ -1,65 +0,0 @@
import { readFileSync } from "fs";
import { request } from "undici";
import { phonemize } from "./phonemizelib.mjs";

let data: {
  word: string;
  phoneme: string;
  partials: string[];
}[] = JSON.parse(readFileSync("./data.json").toString("utf8"));

const cutWord = (sentence: string) => {
  let lastWord = sentence.split(" ").slice(-1)[0].replace(/(\?)/g, "");
  return phonemize(lastWord);
};

export const match = async (sentence: string) => {
  let scores: { complete: string; score: number }[] = [];
  let sentenceWord = await cutWord(sentence);
  console.debug("handling word phoneme = ", sentenceWord);

  for (const { phoneme, word, partials } of data) {
    console.debug("\ttesting with word = ", word, phoneme);

    for (let i = 1; i < phoneme.length; i++) {
      // add n last characters from the phoneme
      let add = phoneme.slice(phoneme.length - i, phoneme.length);
      console.debug(
        "\t\ttesting match with = ",
        add,
        " add = ",
        sentenceWord + add
      );

      // we matched a phoneme
      if (phoneme == sentenceWord + add) {
        let score = 1 / (i / phoneme.length);

        // next, we need to find the completion of the word
        // this is relatively easy since we only need to
        let complete = partials[add];

        if (!complete) {
          // cannot find the completion count.
          // default to index
          console.log("couldn't find corresponding cut", add);
          complete = word;
          continue;
        }

        console.log("\t\tmatched with score = ", score, " complete = ", complete);

        // need to change to the cut-ed version.
        scores.push({ score, complete });
        break;
      }
    }
  }

  let resp = scores.sort((a, b) => b.score - a.score);
  return resp[0]?.complete;
};

match("quoi");
@@ -4,31 +4,70 @@ import {
  RESTPostAPIChannelMessageJSONBody,
  Routes,
} from "discord-api-types/v10";
import { match } from "./algo.mjs";
import { Client } from "@discordnova/nova-js/src/lib/client.js";
import { request } from "undici";

// `autofeur_db` service
export const DB = process.env.DB || "http://localhost:3000";
// nats broker for connecting to nova
export const NATS = process.env.NATS || "localhost:4222";
// rest endpoint for connecting to nova
export const REST = process.env.REST || "http://localhost:8090/api";

(async () => {
  const emitter = new Client({
    transport: {
      additionalEvents: [],
      nats: {
        servers: [NATS],
      },
      queue: "nova-worker-common",
    },
    rest: {
      api: REST,
    },
  });
/**
 * Completes a grapheme using the `autofeur_db` service.
 * @param grapheme Grapheme to complete
 * @returns Completed grapheme
 */
export const completeWord = (grapheme: string) =>
  request(`${DB}?grapheme=${encodeURIComponent(grapheme)}`).then((x) =>
    x.body.text()
  );

  emitter.on(
    "messageCreate",
    async (message: GatewayMessageCreateDispatch["d"]) => {
      let response = await match(message.content);
      if (response) {
/**
 * Cleans a sentence for usage with this program, strips unwanted chars
 * @param sentence Raw discord sentence
 * @returns The last word without any special characters
 */
const cutWord = (sentence: string) => {
  let lastWord = sentence
    .split(" ")
    .slice(-1)[0]
    .replaceAll(/(\s)?([^\x41-\x5A\s^\x61-\x7A^\xC0-\xFF])/g, "");
  return lastWord;
};

/**
 * Nova client for receiving events
 */
const emitter = new Client({
  transport: {
    additionalEvents: [],
    nats: {
      servers: [NATS],
    },
    queue: "autofeur_nova",
  },
  rest: {
    api: REST,
  },
});

/**
 * Handle the message creation event
 */
emitter.on(
  "messageCreate",
  async (message: GatewayMessageCreateDispatch["d"]) => {
    // we shall not respond to bots
    if (message.author.bot) return;
    try {
      // Get the completed word found by the db.
      let response = await completeWord(cutWord(message.content));

      // Ignore if there is no completion
      if (response || response === "") {
        // Respond to the message.
        await emitter.rest.post(Routes.channelMessages(message.channel_id), {
          body: {
            content: response,

@@ -36,9 +75,9 @@ export const REST = process.env.REST || "http://localhost:8090/api";
          } as RESTPostAPIChannelMessageJSONBody,
        });
      }
    }
  );
    } catch (e) {}
  }
);

// We connect ourselves to the nova nats broker.
await emitter.start();
})();
// Start the service (listening for events.)
(async () => await emitter.start())();
@ -1,7 +0,0 @@
|
|||
import { request } from "undici";
|
||||
|
||||
export const PHONEMIZER = process.env.PHONEMIZER || "http://localhost:5000";
|
||||
export const phonemize = (grapheme: string) =>
|
||||
request(`${PHONEMIZER}?grapheme=${encodeURIComponent(grapheme)}`).then((x) =>
|
||||
x.body.text()
|
||||
);
|
|
@ -1,85 +0,0 @@
|
|||
import { writeFileSync } from "fs";
|
||||
import { request } from "undici";
|
||||
import { phonemize } from "./phonemizelib.mjs";
|
||||
|
||||
let jsonData: {
|
||||
word: string;
|
||||
phoneme: string;
|
||||
partials: Record<string, string>;
|
||||
}[] = [];
|
||||
|
||||
let words: string[] = [
|
||||
"ta mère",
|
||||
"tapis",
|
||||
"taper",
|
||||
"taré",
|
||||
"tabasser",
|
||||
"tabouret",
|
||||
"rigole",
|
||||
"amène",
|
||||
"atchoum",
|
||||
"abracadabra",
|
||||
"abeille",
|
||||
"alibaba",
|
||||
"arnaque",
|
||||
"maison",
|
||||
"nombril",
|
||||
"lapin",
|
||||
"ouistiti",
|
||||
"wifi",
|
||||
"wisky",
|
||||
"renard",
|
||||
"requin",
|
||||
"repas",
|
||||
"retard",
|
||||
"coiffeur",
|
||||
"coiffeuse",
|
||||
"kirikou",
|
||||
"kiri",
|
||||
"western",
|
||||
"un deux",
|
||||
"deux trois",
|
||||
"yoplait",
|
||||
"avalanche",
|
||||
"moisissure",
|
||||
"moisson",
|
||||
"moineau",
|
||||
"école",
|
||||
"commentaire",
|
||||
"quantificateur",
|
||||
"commandant",
|
||||
"claire chazal",
|
||||
"tornade",
|
||||
"bottes",
|
||||
"bonsoir pariiiss",
|
||||
"courtois",
|
||||
"facteur",
|
||||
"gérard",
|
||||
"quoidrilatère",
|
||||
"pepe",
|
||||
"surfeur",
|
||||
"toilettes",
|
||||
"lebron james",
|
||||
"c'est de la merde",
|
||||
"trois quatre",
|
||||
"quatre cinq",
|
||||
"cinq six",
|
||||
"six sept",
|
||||
];
|
||||
|
||||
(async () => {
|
||||
for (const word of words) {
|
||||
let phoneme = await phonemize(word);
|
||||
let partials: Record<string, string> = {};
|
||||
|
||||
for (let i = 3; i <= word.length; i++) {
|
||||
// add n last characters from the phoneme
|
||||
let add = word.slice(word.length - i, word.length);
|
||||
partials[await phonemize(add)] = add;
|
||||
}
|
||||
|
||||
jsonData.push({ phoneme, word, partials });
|
||||
}
|
||||
|
||||
writeFileSync("./data.json", JSON.stringify(jsonData));
|
||||
})();
|
|
```diff
@@ -72,17 +72,17 @@
   tslib "^2.4.1"
   ws "^8.11.0"

-"@discordnova/nova-cli@0.0.4":
-  version "0.0.4"
-  resolved "https://registry.yarnpkg.com/@discordnova/nova-cli/-/nova-cli-0.0.4.tgz#66583e349f14c8fafc6f4e9fd184d7bb481304c8"
-  integrity sha512-n+1+Nzc8tTgfT6f6+0E5ELfrj6b5vP73H5FQlnb7gbMFDkCJeHuDlZqxJuy9uax6bIQuMF+uJyan2lTdRF6Z7g==
+"@discordnova/nova-cli@0.0.5":
+  version "0.0.5"
+  resolved "https://registry.yarnpkg.com/@discordnova/nova-cli/-/nova-cli-0.0.5.tgz#9ad013bb25e3aa91795654cfa0ba8bbfb7f3b2fe"
+  integrity sha512-ielyAYo6cTxLT8CyEgDTGxOv9A3gRl3IQCvoETjyDdJrZGd4CJFwBTOB5Dl1tq8wYohsaxEywLb3UnKyTHsYaA==
+  dependencies:
+    undici "^5.15.0"

-"@discordnova/nova-js@^0.0.4":
-  version "0.0.4"
-  resolved "https://registry.yarnpkg.com/@discordnova/nova-js/-/nova-js-0.0.4.tgz#10f530d06f0ccd920491cb4881e2cc4d82e29ea1"
-  integrity sha512-6t23zVkHBzw4jFYkYYkhUbIFBGsQo1CL4xnvuq3oruCpEhVVt0jkkc7RhQB6EspfIiLFuhyfMjRdlKJ1YEpsQg==
+"@discordnova/nova-js@0.0.5":
+  version "0.0.5"
+  resolved "https://registry.yarnpkg.com/@discordnova/nova-js/-/nova-js-0.0.5.tgz#753e126696e789fdd1fda43b08a6a493fff4fc27"
+  integrity sha512-ok1G2czehvptn6ICZYUP5CSqPuRzvI8b+rNFsdEQOEn1G2hLRFvYTn21QzirFcEO5y1yOa9zOr7VnQ3o0HH2Cw==
+  dependencies:
+    "@discordjs/core" "^0.3.0"
+    "@discordjs/rest" "^1.5.0"

@@ -2734,11 +2734,16 @@ typed-array-length@^1.0.4:
   for-each "^0.3.3"
   is-typed-array "^1.1.9"

-typescript@^4.9.3, typescript@^4.9.4:
+typescript@^4.9.3:
   version "4.9.4"
   resolved "https://registry.yarnpkg.com/typescript/-/typescript-4.9.4.tgz#a2a3d2756c079abda241d75f149df9d561091e78"
   integrity sha512-Uz+dTXYzxXXbsFpM86Wh3dKCxrQqUcVMxwU54orwlJjOpO3ao8L7j5lH+dWfTwgCwIuM9GQ2kvVotzYJMXTBZg==

+typescript@^5.0.0-dev.20230120:
+  version "5.0.0-dev.20230120"
+  resolved "https://registry.yarnpkg.com/typescript/-/typescript-5.0.0-dev.20230120.tgz#8b8872448cfb88296c7dc530dc4a60e138230c75"
+  integrity sha512-vgmInMYmRogavAvGVDd+gnrckZJPFbfBp2l7ubTRtZ4CDw9YeUu3121tclQy+0FlmzvhfPUvbJ/ZWEqjQTKGbw==
+
 unbox-primitive@^1.0.2:
   version "1.0.2"
   resolved "https://registry.yarnpkg.com/unbox-primitive/-/unbox-primitive-1.0.2.tgz#29032021057d5e6cdbd08c5129c226dff8ed6f9e"
```
```diff
@@ -1,2 +1,3 @@
 venv
 __pycache__
+assets/
```
deep_phonemizer/.gitignore

```diff
@@ -1,3 +1,3 @@
 venv/
 __pycache__/
-latin_ipa_forward.pt
+assets/
```
```diff
@@ -1,8 +1,7 @@
-
 FROM python:3.7.3-slim
 COPY requirements.txt /
 RUN apt-get update && apt-get install -y build-essential
 RUN pip3 install -r /requirements.txt
 COPY . /app
 WORKDIR /app
-ENTRYPOINT gunicorn app:app -w 2 --threads 2 -b 0.0.0.0:8000
+ENTRYPOINT gunicorn app:app -w 8 --threads 8 -b 0.0.0.0:8000
```
```diff
@@ -2,10 +2,18 @@ from dp.phonemizer import Phonemizer
 from flask import Flask
 from flask import request

-phonemizer = Phonemizer.from_checkpoint('latin_ipa_forward.pt')
+phonemizer = Phonemizer.from_checkpoint('assets/model.pt')
 app = Flask(__name__, instance_relative_config=True)

 @app.route('/')
 def handle():
-    searchword = request.args.get('grapheme', '')
-    return phonemizer(searchword, lang = 'fr')
+    """
+    Simple route that handles the grapheme-to-phoneme translation.
+    """
+    grapheme = request.args.get('grapheme')
+    if grapheme is None:
+        return "You are missing the 'grapheme' parameter", 400
+    lang = request.args.get('language')
+    if lang is None:
+        lang = 'fr'
+    return phonemizer(grapheme, lang = lang), 200
```
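The new handler validates its query parameters instead of silently phonemizing an empty string. Its control flow can be exercised in isolation; in this sketch the phonemizer is stubbed out and `args` is a plain dict, both assumptions for illustration (the real route reads `request.args` and calls the DeepPhonemizer model):

```python
def fake_phonemizer(grapheme, lang):
    # Stand-in for the DeepPhonemizer model call.
    return f"ipa({grapheme}|{lang})"


def handle(args):
    """Mirror of the route's logic: 400 without 'grapheme', 'fr' as default language."""
    grapheme = args.get('grapheme')
    if grapheme is None:
        return "You are missing the 'grapheme' parameter", 400
    lang = args.get('language')
    if lang is None:
        lang = 'fr'
    return fake_phonemizer(grapheme, lang), 200


print(handle({'grapheme': 'coiffeur'}))                     # ('ipa(coiffeur|fr)', 200)
print(handle({'grapheme': 'renard', 'language': 'en'})[0])  # ipa(renard|en)
print(handle({})[1])                                        # 400
```

Against the running service (gunicorn binds `0.0.0.0:8000`), the equivalent call would be `curl 'http://localhost:8000/?grapheme=coiffeur&language=fr'`.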
|
@ -4,16 +4,27 @@ services:
|
|||
build: autofeur_nova
|
||||
restart: always
|
||||
depends_on:
|
||||
- deep_phonemizer
|
||||
- autofeur_db
|
||||
- nats
|
||||
environment:
|
||||
- NATS=nats
|
||||
- REST=http://rest:8090/api
|
||||
- DB=http://autofeur_db:3000/
|
||||
|
||||
autofeur_db:
|
||||
build: autofeur_db
|
||||
restart: always
|
||||
depends_on:
|
||||
- deep_phonemizer
|
||||
environment:
|
||||
- PHONEMIZER=http://deep_phonemizer:8000/
|
||||
volumes:
|
||||
- ./autofeur_db/assets/db.bin:/app/assets/db.bin
|
||||
deep_phonemizer:
|
||||
build: deep_phonemizer
|
||||
restart: always
|
||||
|
||||
volumes:
|
||||
- ./deep_phonemizer/assets/model.pt:/app/assets/model.pt
|
||||
nats:
|
||||
image: nats
|
||||
restart: always
|
||||
|
@ -31,9 +42,11 @@ services:
|
|||
- nats
|
||||
- redis
|
||||
|
||||
gateway:
|
||||
gateway0:
|
||||
image: ghcr.io/discordnova/nova/gateway
|
||||
restart: always
|
||||
environment:
|
||||
- NOVA__GATEWAY__SHARD=0
|
||||
volumes:
|
||||
- ./autofeur_nova/config/default.yml:/config/default.yml
|
||||
depends_on:
|
||||
|
|