Nkrane: Offline-First English-Twi Translator

Fast, offline neural translation for English to Twi using CTranslate2. Optimized for CPU with minimal resource usage.

Demo

Try it with the web interface: https://huggingface.co/spaces/ghananlpcommunity/nkrane-translator

Installation

pip install ctranslate2 transformers huggingface_hub spacy pandas
python -m spacy download en_core_web_sm

Translation Modes

1. Standard Mode

Direct neural translation: the full text is translated by the model, with no dictionary lookups.

import ctranslate2
from transformers import AutoTokenizer
from huggingface_hub import snapshot_download
import spacy

# Load spaCy for sentence segmentation
nlp = spacy.load("en_core_web_sm")

# Download model
model_path = snapshot_download(repo_id="ghananlpcommunity/nkrane_ct2")

# Load the CTranslate2 model and tokenizer
model = ctranslate2.Translator(model_path, device="cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path)

def split_into_batches(text, max_chars=600):
    """Split long text at sentence boundaries - simplified version based on your working app."""
    if len(text) <= max_chars:
        return [text]
    
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    
    batches = []
    current = ""
    
    for sentence in sentences:
        # If adding this sentence would exceed max_chars, start new batch
        if current and (len(current) + len(sentence) + 1 > max_chars):
            batches.append(current.strip())
            current = sentence
        else:
            if current:
                current += " " + sentence
            else:
                current = sentence
    
    # Add the last batch
    if current:
        batches.append(current.strip())
    
    return batches

def translate_standard(text):
    """Translate with automatic batching - fixed version."""
    batches = split_into_batches(text)
    translations = []
    
    for batch in batches:
        # Set source language
        tokenizer.src_lang = "eng_Latn"
        
        # Tokenize
        input_ids = tokenizer.encode(batch, add_special_tokens=True)
        source = tokenizer.convert_ids_to_tokens(input_ids)
        
        # Translate with beam search
        results = model.translate_batch(
            [source],
            beam_size=5,
            max_decoding_length=512
        )
        
        # Decode result
        output_tokens = results[0].hypotheses[0]
        translation = tokenizer.decode(
            tokenizer.convert_tokens_to_ids(output_tokens), 
            skip_special_tokens=True
        )
        translations.append(translation)
    
    return " ".join(translations)

# Example usage
text = "Joy FM is a privately owned radio station in Accra, the capital of Ghana. The station is owned and run by the media group company Multimedia Group Limited. It is arguably the leading radio station in Ghana that broadcasts in the English language."
print("Input text:")
print(text)
print("\nTranslation:")
print(translate_standard(text))
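
Longer inputs are handled automatically: split_into_batches cuts the text at sentence boundaries so each chunk stays under the character limit. A quick, optional sanity check (the sample text below is made up):

# Optional: confirm that a long input is split into several chunks under max_chars
long_text = " ".join(["This is a fairly ordinary English sentence about the radio station."] * 20)
chunks = split_into_batches(long_text)
print(len(chunks), [len(c) for c in chunks])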

2. Hybrid Mode

Combines neural translation with a CSV dictionary for precise control over noun phrases.

How it works:

  1. Extract noun phrases from English text
  2. Replace dictionary matches with placeholders (e.g., <1>, <2>)
  3. Translate placeholder text with neural model
  4. Substitute placeholders with Twi translations from CSV

Dictionary Format: The model includes a default CSV (translation_dict.csv) with English-Twi pairs:

english,twi
house,ofie
car,ntentan
school,sukuu
water,nsu

You can use your own CSV at any location - just update the path.
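
For example, a minimal sketch of loading a local dictionary (the filename my_dictionary.csv is hypothetical; the column names match the bundled file):

import pandas as pd

# Hypothetical local path; the CSV must have 'english' and 'twi' columns
df = pd.read_csv("my_dictionary.csv")
noun_dict = dict(zip(df['english'].str.lower().str.strip(), df['twi'].str.strip()))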

import spacy
import pandas as pd
import re
import ctranslate2
from transformers import AutoTokenizer
from huggingface_hub import snapshot_download, hf_hub_download

# Setup
nlp = spacy.load("en_core_web_sm")
STOPWORDS = nlp.Defaults.stop_words

# Load model and tokenizer
model_path = snapshot_download(repo_id="ghananlpcommunity/nkrane_ct2")
model = ctranslate2.Translator(model_path, device="cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load CSV dictionary
csv_path = hf_hub_download(repo_id="ghananlpcommunity/nkrane_ct2", filename="translation_dict.csv")
df = pd.read_csv(csv_path)
noun_dict = dict(zip(df['english'].str.lower().str.strip(), df['twi'].str.strip()))

def remove_stopwords(phrase):
    """Remove stopwords from a phrase."""
    doc = nlp(phrase) if isinstance(phrase, str) else phrase
    cleaned = [token.text for token in doc if token.text.lower() not in STOPWORDS]
    return ' '.join(cleaned).strip()

def split_into_batches(text, max_chars=600):
    """Split long text at sentence boundaries - simplified version."""
    if len(text) <= max_chars:
        return [text]
    
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    
    batches = []
    current = ""
    
    for sentence in sentences:
        # If adding this sentence would exceed max_chars, start new batch
        if current and (len(current) + len(sentence) + 1 > max_chars):
            batches.append(current.strip())
            current = sentence
        else:
            if current:
                current += " " + sentence
            else:
                current = sentence
    
    # Add the last batch
    if current:
        batches.append(current.strip())
    
    return batches

def extract_noun_phrases(text):
    """Extract noun phrases from text."""
    doc = nlp(text)
    noun_phrases = []
    
    # Extract noun chunks
    for chunk in doc.noun_chunks:
        noun_phrases.append({
            'text': chunk.text,
            'start': chunk.start_char,
            'end': chunk.end_char
        })
    
    # Add standalone proper nouns
    covered = set()
    for np in noun_phrases:
        for i in range(np['start'], np['end']):
            covered.add(i)
    
    for token in doc:
        if token.pos_ == "PROPN" and token.idx not in covered:
            noun_phrases.append({
                'text': token.text,
                'start': token.idx,
                'end': token.idx + len(token.text)
            })
    
    # Remove overlaps (keep longer phrases)
    noun_phrases.sort(key=lambda x: (x['start'], -(x['end'] - x['start'])))
    filtered = []
    last_end = -1
    
    for np in noun_phrases:
        if np['start'] >= last_end:
            filtered.append(np)
            last_end = np['end']
    
    return filtered

def filter_dict_phrases(noun_phrases, dictionary):
    """Filter phrases to only those in dictionary."""
    filtered = []
    for phrase in noun_phrases:
        phrase_lower = phrase['text'].lower().strip()
        cleaned = remove_stopwords(phrase['text']).lower().strip()
        if phrase_lower in dictionary or (cleaned and cleaned in dictionary):
            filtered.append(phrase)
    return filtered

def create_placeholders(text, noun_phrases):
    """Replace content words in noun phrases with placeholders."""
    if not noun_phrases:
        return text, {}
    
    sorted_phrases = sorted(noun_phrases, key=lambda x: x['start'])
    phrase_mapping = {}
    result_parts = []
    last_end = 0
    
    for idx, phrase in enumerate(sorted_phrases, 1):
        placeholder = f"<{idx}>"
        phrase_text = phrase['text']
        phrase_doc = nlp(phrase_text)
        
        # Split into leading stopwords, content, trailing stopwords
        leading_stop = []
        content = []
        trailing_stop = []
        content_started = False
        content_ended = False
        
        for token in phrase_doc:
            is_stop = token.text.lower() in STOPWORDS
            
            if not content_started and is_stop:
                leading_stop.append(token.text_with_ws)
            elif not content_ended:
                content_started = True
                if is_stop:
                    remaining = phrase_doc[token.i + 1:]
                    if any(t.text.lower() not in STOPWORDS for t in remaining):
                        content.append(token.text_with_ws)
                    else:
                        content_ended = True
                        trailing_stop.append(token.text_with_ws)
                else:
                    content.append(token.text_with_ws)
            else:
                trailing_stop.append(token.text_with_ws)
        
        content_text = ''.join(content).strip()
        phrase_mapping[placeholder] = content_text
        
        replacement = ''.join(leading_stop) + placeholder + ''.join(trailing_stop)
        result_parts.append(text[last_end:phrase['start']])
        result_parts.append(replacement)
        last_end = phrase['end']
    
    result_parts.append(text[last_end:])
    return ''.join(result_parts), phrase_mapping

def lookup_phrases(phrase_mapping, dictionary):
    """Look up phrases in dictionary."""
    translation_mapping = {}
    
    for placeholder, phrase_text in phrase_mapping.items():
        phrase_lower = phrase_text.lower().strip()
        
        if phrase_lower in dictionary:
            translation_mapping[placeholder] = dictionary[phrase_lower]
        else:
            cleaned = remove_stopwords(phrase_text).lower().strip()
            if cleaned and cleaned in dictionary:
                translation_mapping[placeholder] = dictionary[cleaned]
            else:
                translation_mapping[placeholder] = phrase_text  # Keep original if no match
    
    return translation_mapping

def translate_with_model(text):
    """Translate text using CTranslate2 model - simplified version."""
    tokenizer.src_lang = "eng_Latn"
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    source = tokenizer.convert_ids_to_tokens(input_ids)
    
    # Translate with beam search
    results = model.translate_batch(
        [source],
        beam_size=5,
        max_decoding_length=512
    )
    
    output_tokens = results[0].hypotheses[0]
    translation = tokenizer.decode(
        tokenizer.convert_tokens_to_ids(output_tokens), 
        skip_special_tokens=True
    )
    
    return translation

def replace_placeholders(translated_text, translation_mapping):
    """Replace placeholders with Twi translations."""
    result = translated_text
    sorted_items = sorted(
        translation_mapping.items(), 
        key=lambda x: int(re.search(r'\d+', x[0]).group())
    )
    for placeholder, twi_phrase in sorted_items:
        result = result.replace(placeholder, twi_phrase)
    return result

def translate_hybrid(text):
    """Main hybrid translation function with batching."""
    if not text.strip():
        return "Please enter text to translate."
    
    # Split into batches if needed
    batches = split_into_batches(text)
    translations = []
    
    for i, batch in enumerate(batches):
        print(f"Processing batch {i+1}/{len(batches)}: {batch[:50]}...")
        
        # Extract noun phrases
        all_phrases = extract_noun_phrases(batch)
        noun_phrases = filter_dict_phrases(all_phrases, noun_dict)
        
        # Create placeholders
        placeholder_text, phrase_mapping = create_placeholders(batch, noun_phrases)
        
        # Look up dictionary translations
        translation_mapping = lookup_phrases(phrase_mapping, noun_dict)
        
        # Translate with model
        model_output = translate_with_model(placeholder_text)
        
        # Replace placeholders with Twi
        final_translation = replace_placeholders(model_output, translation_mapping)
        translations.append(final_translation)
    
    return " ".join(translations)

# Example usage
text = "Joy FM is a privately owned radio station in Accra, the capital of Ghana. The station is owned and run by the media group company Multimedia Group Limited. It is arguably the leading radio station in Ghana that broadcasts in the English language."
print("Input text:")
print(text)
print("\nHybrid translation:")
result = translate_hybrid(text)
print(result)
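
If the hybrid output is not picking up your dictionary entries, it can help to inspect which extracted noun phrases actually match the CSV; a small sketch using the helpers defined above (the sample sentence is illustrative):

# Check which extracted phrases match the CSV dictionary
phrases = extract_noun_phrases("The school is near the water.")
matches = filter_dict_phrases(phrases, noun_dict)
print([p['text'] for p in matches])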

GPU Support

For faster inference on GPU:

model = ctranslate2.Translator(model_path, device="cuda", compute_type="int8")
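
Quantization can also be applied on CPU to reduce memory; a minimal sketch (whether int8 helps depends on your CPU and on how the model was converted):

# CPU inference with int8 quantization at load time
model = ctranslate2.Translator(model_path, device="cpu", compute_type="int8")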

License

Apache 2.0

Citation

@misc{nkrane_2025,
  title={Nkrane: English-to-Twi Neural Machine Translation},
  author={Ghana NLP Community},
  year={2025},
  howpublished={https://huggingface.co/ghananlpcommunity/nkrane}
}