Nkrane: Offline-First English-Twi Translator
Fast, offline neural translation for English to Twi using CTranslate2. Optimized for CPU with minimal resource usage.
Demo
Try it with the web interface: https://huggingface.co/spaces/ghananlpcommunity/nkrane-translator
Installation
pip install ctranslate2 transformers spacy pandas
python -m spacy download en_core_web_sm
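A quick, optional sanity check that the environment is ready (a sketch; it only assumes the commands above completed without errors):

import ctranslate2, transformers, spacy
print("ctranslate2", ctranslate2.__version__, "| transformers", transformers.__version__)
nlp = spacy.load("en_core_web_sm")  # raises OSError if the spaCy model was not downloaded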
Translation Modes
1. Standard Mode
Direct neural translation - the model handles everything.
import ctranslate2
from transformers import AutoTokenizer
from huggingface_hub import snapshot_download
import spacy
# Load spaCy for sentence segmentation
nlp = spacy.load("en_core_web_sm")
# Download model
model_path = snapshot_download(repo_id="ghananlpcommunity/nkrane_ct2")
# Load model and tokenizer
model = ctranslate2.Translator(model_path, device="cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path)
def split_into_batches(text, max_chars=600):
    """Split long text into chunks at sentence boundaries."""
    if len(text) <= max_chars:
        return [text]
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    batches = []
    current = ""
    for sentence in sentences:
        # If adding this sentence would exceed max_chars, start a new batch
        if current and (len(current) + len(sentence) + 1 > max_chars):
            batches.append(current.strip())
            current = sentence
        else:
            if current:
                current += " " + sentence
            else:
                current = sentence
    # Add the last batch
    if current:
        batches.append(current.strip())
    return batches
def translate_standard(text):
    """Translate text, batching long inputs at sentence boundaries."""
    batches = split_into_batches(text)
    translations = []
    for batch in batches:
        # Set source language
        tokenizer.src_lang = "eng_Latn"
        # Tokenize
        input_ids = tokenizer.encode(batch, add_special_tokens=True)
        source = tokenizer.convert_ids_to_tokens(input_ids)
        # Translate
        results = model.translate_batch(
            [source],
            beam_size=5,
            max_decoding_length=512
        )
        # Decode result
        output_tokens = results[0].hypotheses[0]
        translation = tokenizer.decode(
            tokenizer.convert_tokens_to_ids(output_tokens),
            skip_special_tokens=True
        )
        translations.append(translation)
    return " ".join(translations)
# Example
text = "Joy FM is a privately owned radio station in Accra, the capital of Ghana. The station is owned and run by the media group company Multimedia Group Limited. It is arguably the leading radio station in Ghana that broadcasts in the English language."
print("Input text:")
print(text)
print("\nTranslation:")
print(translate_standard(text))
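translate_standard processes one chunk per call to translate_batch. CTranslate2 can also take several tokenized segments in a single translate_batch call, which is usually faster for long inputs. The sketch below is an optional variant that reuses the model, tokenizer, and split_into_batches defined above:

def translate_standard_batched(text):
    """Variant: send all chunks to translate_batch in one call."""
    tokenizer.src_lang = "eng_Latn"
    chunks = split_into_batches(text)
    sources = [
        tokenizer.convert_ids_to_tokens(tokenizer.encode(chunk, add_special_tokens=True))
        for chunk in chunks
    ]
    results = model.translate_batch(sources, beam_size=5, max_decoding_length=512)
    return " ".join(
        tokenizer.decode(
            tokenizer.convert_tokens_to_ids(r.hypotheses[0]),
            skip_special_tokens=True
        )
        for r in results
    )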
2. Hybrid Mode
Combines neural translation with a CSV dictionary for precise control over key noun phrases.
How it works (illustrated with a short example below):
- Extract noun phrases from the English text
- Replace dictionary matches with placeholders (e.g., <1>, <2>)
- Translate the placeholder text with the neural model
- Substitute the placeholders with Twi translations from the CSV
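For instance, with the hypothetical sentence below and "house" listed in the dictionary as "ofie" (as in the sample CSV in the next section):
- Input: "The house in Accra is new."
- After placeholder substitution: "The <1> in Accra is new." with phrase_mapping = {"<1>": "house"}
- Dictionary lookup: translation_mapping = {"<1>": "ofie"}
- Neural translation of the placeholder text; "<1>" is expected to pass through unchanged
- Final step: "<1>" in the Twi output is replaced with "ofie"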
Dictionary Format:
The model includes a default CSV (translation_dict.csv) with English-Twi pairs:
english,twi
house,ofie
car,ntentan
school,sukuu
water,nsu
You can use your own CSV at any location - just update the path.
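For example, loading a custom dictionary might look like this (the filename below is only a placeholder; the CSV needs english and twi columns):

import pandas as pd

df = pd.read_csv("my_translation_dict.csv")  # hypothetical path to your own CSV
noun_dict = dict(zip(df['english'].str.lower().str.strip(), df['twi'].str.strip()))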
import spacy
import pandas as pd
import re
import ctranslate2
from transformers import AutoTokenizer
from huggingface_hub import snapshot_download, hf_hub_download
# Setup
nlp = spacy.load("en_core_web_sm")
STOPWORDS = nlp.Defaults.stop_words
# Load model and tokenizer
model_path = snapshot_download(repo_id="ghananlpcommunity/nkrane_ct2")
model = ctranslate2.Translator(model_path, device="cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Load CSV dictionary
csv_path = hf_hub_download(repo_id="ghananlpcommunity/nkrane_ct2", filename="translation_dict.csv")
df = pd.read_csv(csv_path)
noun_dict = dict(zip(df['english'].str.lower().str.strip(), df['twi'].str.strip()))
def remove_stopwords(phrase):
    """Remove stopwords from a phrase."""
    doc = nlp(phrase) if isinstance(phrase, str) else phrase
    cleaned = [token.text for token in doc if token.text.lower() not in STOPWORDS]
    return ' '.join(cleaned).strip()
def split_into_batches(text, max_chars=600):
    """Split long text into chunks at sentence boundaries."""
    if len(text) <= max_chars:
        return [text]
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    batches = []
    current = ""
    for sentence in sentences:
        # If adding this sentence would exceed max_chars, start a new batch
        if current and (len(current) + len(sentence) + 1 > max_chars):
            batches.append(current.strip())
            current = sentence
        else:
            if current:
                current += " " + sentence
            else:
                current = sentence
    # Add the last batch
    if current:
        batches.append(current.strip())
    return batches
def extract_noun_phrases(text):
    """Extract noun phrases from text."""
    doc = nlp(text)
    noun_phrases = []
    # Extract noun chunks
    for chunk in doc.noun_chunks:
        noun_phrases.append({
            'text': chunk.text,
            'start': chunk.start_char,
            'end': chunk.end_char
        })
    # Add standalone proper nouns
    covered = set()
    for np in noun_phrases:
        for i in range(np['start'], np['end']):
            covered.add(i)
    for token in doc:
        if token.pos_ == "PROPN" and token.idx not in covered:
            noun_phrases.append({
                'text': token.text,
                'start': token.idx,
                'end': token.idx + len(token.text)
            })
    # Remove overlaps (keep longer phrases)
    noun_phrases.sort(key=lambda x: (x['start'], -(x['end'] - x['start'])))
    filtered = []
    last_end = -1
    for np in noun_phrases:
        if np['start'] >= last_end:
            filtered.append(np)
            last_end = np['end']
    return filtered
def filter_dict_phrases(noun_phrases, dictionary):
    """Filter phrases to only those in the dictionary."""
    filtered = []
    for phrase in noun_phrases:
        phrase_lower = phrase['text'].lower().strip()
        cleaned = remove_stopwords(phrase['text']).lower().strip()
        if phrase_lower in dictionary or (cleaned and cleaned in dictionary):
            filtered.append(phrase)
    return filtered
def create_placeholders(text, noun_phrases):
    """Replace content words in noun phrases with placeholders."""
    if not noun_phrases:
        return text, {}
    sorted_phrases = sorted(noun_phrases, key=lambda x: x['start'])
    phrase_mapping = {}
    result_parts = []
    last_end = 0
    for idx, phrase in enumerate(sorted_phrases, 1):
        placeholder = f"<{idx}>"
        phrase_text = phrase['text']
        phrase_doc = nlp(phrase_text)
        # Split into leading stopwords, content, trailing stopwords
        leading_stop = []
        content = []
        trailing_stop = []
        content_started = False
        content_ended = False
        for token in phrase_doc:
            is_stop = token.text.lower() in STOPWORDS
            if not content_started and is_stop:
                leading_stop.append(token.text_with_ws)
            elif not content_ended:
                content_started = True
                if is_stop:
                    remaining = phrase_doc[token.i + 1:]
                    if any(t.text.lower() not in STOPWORDS for t in remaining):
                        content.append(token.text_with_ws)
                    else:
                        content_ended = True
                        trailing_stop.append(token.text_with_ws)
                else:
                    content.append(token.text_with_ws)
            else:
                trailing_stop.append(token.text_with_ws)
        content_text = ''.join(content).strip()
        phrase_mapping[placeholder] = content_text
        replacement = ''.join(leading_stop) + placeholder + ''.join(trailing_stop)
        result_parts.append(text[last_end:phrase['start']])
        result_parts.append(replacement)
        last_end = phrase['end']
    result_parts.append(text[last_end:])
    return ''.join(result_parts), phrase_mapping
def lookup_phrases(phrase_mapping, dictionary):
    """Look up phrases in the dictionary."""
    translation_mapping = {}
    for placeholder, phrase_text in phrase_mapping.items():
        phrase_lower = phrase_text.lower().strip()
        if phrase_lower in dictionary:
            translation_mapping[placeholder] = dictionary[phrase_lower]
        else:
            cleaned = remove_stopwords(phrase_text).lower().strip()
            if cleaned and cleaned in dictionary:
                translation_mapping[placeholder] = dictionary[cleaned]
            else:
                translation_mapping[placeholder] = phrase_text  # Keep original if no match
    return translation_mapping
def translate_with_model(text):
    """Translate text with the CTranslate2 model."""
    tokenizer.src_lang = "eng_Latn"
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    source = tokenizer.convert_ids_to_tokens(input_ids)
    # Translate
    results = model.translate_batch(
        [source],
        beam_size=5,
        max_decoding_length=512
    )
    output_tokens = results[0].hypotheses[0]
    translation = tokenizer.decode(
        tokenizer.convert_tokens_to_ids(output_tokens),
        skip_special_tokens=True
    )
    return translation
def replace_placeholders(translated_text, translation_mapping):
    """Replace placeholders with Twi translations."""
    result = translated_text
    sorted_items = sorted(
        translation_mapping.items(),
        key=lambda x: int(re.search(r'\d+', x[0]).group())
    )
    for placeholder, twi_phrase in sorted_items:
        result = result.replace(placeholder, twi_phrase)
    return result
def translate_hybrid(text):
    """Main hybrid translation function with batching."""
    if not text.strip():
        return "Please enter text to translate."
    # Split into batches if needed
    batches = split_into_batches(text)
    translations = []
    for i, batch in enumerate(batches):
        print(f"Processing batch {i+1}/{len(batches)}: {batch[:50]}...")
        # Extract noun phrases
        all_phrases = extract_noun_phrases(batch)
        noun_phrases = filter_dict_phrases(all_phrases, noun_dict)
        # Create placeholders
        placeholder_text, phrase_mapping = create_placeholders(batch, noun_phrases)
        # Look up dictionary translations
        translation_mapping = lookup_phrases(phrase_mapping, noun_dict)
        # Translate with model
        model_output = translate_with_model(placeholder_text)
        # Replace placeholders with Twi
        final_translation = replace_placeholders(model_output, translation_mapping)
        translations.append(final_translation)
    return " ".join(translations)
# Example
text = "Joy FM is a privately owned radio station in Accra, the capital of Ghana. The station is owned and run by the media group company Multimedia Group Limited. It is arguably the leading radio station in Ghana that broadcasts in the English language."
print("Input text:")
print(text)
print("\nHybrid translation:")
result = translate_hybrid(text)
print(result)
GPU Support
For faster inference on GPU:
model = ctranslate2.Translator(model_path, device="cuda", compute_type="int8")
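CTranslate2 also accepts a compute_type on CPU; int8 quantization typically reduces memory use with a small quality trade-off (a sketch, not benchmarked for this model):

model = ctranslate2.Translator(model_path, device="cpu", compute_type="int8")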
License
Apache 2.0
Citation
@misc{nkrane_2025,
  title={Nkrane: English-to-Twi Neural Machine Translation},
  author={Ghana NLP Community},
  year={2025},
  howpublished={https://huggingface.co/ghananlpcommunity/nkrane}
}