<!DOCTYPE html>
<html lang="ja">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Moonshine Tiny JA - リアルタイム日本語文字起こし</title>
<meta name="description" content="ブラウザ上で動作する日本語音声認識デモ。Moonshine Tiny JAモデルをTransformers.jsで実行。">
<style>
* {
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
background: #1a1a2e;
color: #eee;
min-height: 100vh;
}
h1 {
text-align: center;
color: #00d4ff;
margin-bottom: 5px;
}
.subtitle {
text-align: center;
color: #888;
font-size: 14px;
margin-bottom: 20px;
}
.status {
text-align: center;
padding: 10px;
border-radius: 8px;
margin: 20px 0;
background: #16213e;
}
.status.loading {
color: #ffa500;
}
.status.ready {
color: #00ff88;
}
.status.recording {
color: #ff4757;
}
.status.error {
color: #ff4757;
background: #2d1f1f;
}
button {
display: block;
width: 200px;
margin: 20px auto;
padding: 15px 30px;
font-size: 18px;
border: none;
border-radius: 50px;
cursor: pointer;
transition: all 0.3s;
}
button:disabled {
background: #555;
cursor: not-allowed;
}
#startBtn {
background: linear-gradient(135deg, #00d4ff, #00ff88);
color: #1a1a2e;
font-weight: bold;
}
#startBtn:hover:not(:disabled) {
transform: scale(1.05);
box-shadow: 0 0 20px rgba(0, 212, 255, 0.5);
}
#startBtn.recording {
background: linear-gradient(135deg, #ff4757, #ff6b81);
animation: pulse 1s infinite;
}
@keyframes pulse {
0%, 100% {
box-shadow: 0 0 0 0 rgba(255, 71, 87, 0.4);
}
50% {
box-shadow: 0 0 0 15px rgba(255, 71, 87, 0);
}
}
.control-container {
margin: 20px 0;
padding: 15px;
background: #16213e;
border-radius: 8px;
}
.control-container label {
display: block;
margin-bottom: 10px;
}
.control-container input[type="range"] {
width: 100%;
cursor: pointer;
}
.slider-labels {
display: flex;
justify-content: space-between;
font-size: 12px;
color: #888;
margin-top: 5px;
}
.mode-switch {
display: flex;
gap: 10px;
margin-bottom: 15px;
}
.mode-switch button {
flex: 1;
width: auto;
margin: 0;
padding: 10px 15px;
font-size: 14px;
border-radius: 8px;
background: #0f0f23;
color: #888;
}
.mode-switch button.active {
background: linear-gradient(135deg, #00d4ff, #00ff88);
color: #1a1a2e;
font-weight: bold;
}
.mode-switch button:disabled {
opacity: 0.5;
}
.mode-description {
font-size: 12px;
color: #888;
margin-top: 10px;
padding: 10px;
background: #0f0f23;
border-radius: 6px;
}
#transcript {
background: #16213e;
border-radius: 12px;
padding: 20px;
min-height: 200px;
margin-top: 20px;
font-size: 18px;
line-height: 1.8;
white-space: pre-wrap;
word-wrap: break-word;
}
#transcript:empty::before {
content: "文字起こし結果がここに表示されます...";
color: #666;
}
#currentText {
color: #00d4ff;
font-style: italic;
min-height: 30px;
margin-top: 10px;
text-align: center;
}
.info {
background: #16213e;
border-radius: 8px;
padding: 15px;
margin: 20px 0;
font-size: 14px;
color: #aaa;
}
.info a {
color: #00d4ff;
text-decoration: none;
}
.info a:hover {
text-decoration: underline;
}
.progress-container {
background: #0f0f23;
border-radius: 10px;
height: 20px;
margin: 10px 0;
overflow: hidden;
}
.progress-bar {
height: 100%;
background: linear-gradient(90deg, #00d4ff, #00ff88);
width: 0%;
transition: width 0.3s;
}
.footer {
text-align: center;
margin-top: 30px;
padding-top: 20px;
border-top: 1px solid #333;
font-size: 12px;
color: #666;
}
.footer a {
color: #00d4ff;
text-decoration: none;
}
</style>
</head>
<body>
<h1>Moonshine Tiny JA</h1>
<p class="subtitle">ブラウザで動作する日本語リアルタイム文字起こし</p>
<div id="status" class="status loading">
モデルを読み込み中...
<div class="progress-container">
<div id="progressBar" class="progress-bar"></div>
</div>
</div>
<button id="startBtn" disabled>読み込み中...</button>
<div class="control-container">
<div class="mode-switch">
<button id="modeChunk" class="active">区切りモード</button>
<button id="modeOverlap">オーバーラップモード</button>
</div>
<div id="modeDescription" class="mode-description">
指定間隔ごとに録音を区切って処理。シンプルだが境界で言葉が途切れる可能性あり。
</div>
</div>
<div class="control-container">
<label for="intervalSlider">
<span id="intervalLabel">録音間隔</span>: <span id="intervalValue">3</span>
</label>
<input type="range" id="intervalSlider" min="1" max="6" step="0.5" value="3">
<div class="slider-labels">
<span>1秒 (高速)</span>
<span>6秒 (高精度)</span>
</div>
</div>
<div id="currentText"></div>
<div id="transcript"></div>
<div class="info">
<strong>使い方:</strong><br>
1. モデルの読み込みを待つ(初回は数分かかります)<br>
2. モードを選択(オーバーラップ推奨)<br>
3. 「録音開始」ボタンをクリック<br>
4. マイクに向かって話す<br><br>
<strong>モデル:</strong> <a href="https://huggingface.co/wmoto-ai/moonshine-tiny-ja-ONNX" target="_blank">wmoto-ai/moonshine-tiny-ja-ONNX</a><br>
<strong>ベース:</strong> <a href="https://huggingface.co/UsefulSensors/moonshine-tiny-ja" target="_blank">UsefulSensors/moonshine-tiny-ja</a>
</div>
<div class="footer">
Powered by <a href="https://www.moonshine.ai/" target="_blank">Moonshine AI</a> |
<a href="https://huggingface.co/docs/transformers.js" target="_blank">Transformers.js</a><br>
Licensed under <a href="https://github.com/usefulsensors/moonshine/blob/main/LICENSE" target="_blank">Moonshine AI Community License</a>
</div>
<script type="module">
import {
MoonshineForConditionalGeneration,
AutoProcessor,
AutoTokenizer,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3';
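// Transformers.js v3 loaded from the jsDelivr CDN; pinning the major version (@3)
// keeps the demo insulated from breaking API changes in future releases.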
const statusEl = document.getElementById('status');
const startBtn = document.getElementById('startBtn');
const transcriptEl = document.getElementById('transcript');
const currentTextEl = document.getElementById('currentText');
const progressBar = document.getElementById('progressBar');
const intervalSlider = document.getElementById('intervalSlider');
const intervalValue = document.getElementById('intervalValue');
const intervalLabel = document.getElementById('intervalLabel');
const modeChunkBtn = document.getElementById('modeChunk');
const modeOverlapBtn = document.getElementById('modeOverlap');
const modeDescription = document.getElementById('modeDescription');
let model = null;
let processor = null;
let tokenizer = null;
let isRecording = false;
let isProcessing = false;
let currentMode = 'chunk';
// Chunk mode variables
let mediaRecorder = null;
let audioChunks = [];
// Overlap mode variables
let audioContext = null;
let mediaStream = null;
let scriptProcessor = null;
let audioBuffer = [];
let processTimer = null;
let lastTranscript = '';
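// SAMPLE_RATE: the model expects 16 kHz mono audio, so every capture path below
// records at or resamples to this rate.
// WINDOW_SEC: length of the sliding window transcribed in overlap mode; longer
// windows give more context at the cost of latency.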
const SAMPLE_RATE = 16000;
const WINDOW_SEC = 4;
const MODEL_ID = 'wmoto-ai/moonshine-tiny-ja-ONNX';
// Mode switching
function setMode(mode) {
if (isRecording) return;
currentMode = mode;
if (mode === 'chunk') {
modeChunkBtn.classList.add('active');
modeOverlapBtn.classList.remove('active');
modeDescription.textContent = '指定間隔ごとに録音を区切って処理。シンプルだが境界で言葉が途切れる可能性あり。';
intervalLabel.textContent = '録音間隔';
intervalSlider.min = '1';
intervalSlider.max = '6';
intervalSlider.value = '3';
intervalValue.textContent = '3';
} else {
modeChunkBtn.classList.remove('active');
modeOverlapBtn.classList.add('active');
modeDescription.textContent = '連続バッファ + オーバーラップ処理。境界での途切れを防ぎ、滑らかな文字起こしを実現。';
intervalLabel.textContent = '処理間隔';
intervalSlider.min = '1';
intervalSlider.max = '4';
intervalSlider.value = '2';
intervalValue.textContent = '2';
}
}
modeChunkBtn.addEventListener('click', () => setMode('chunk'));
modeOverlapBtn.addEventListener('click', () => setMode('overlap'));
intervalSlider.addEventListener('input', () => {
intervalValue.textContent = intervalSlider.value;
});
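// Model, processor and tokenizer are fetched in parallel; only the model download
// reports progress, which is what drives the progress bar during the initial load.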
async function loadModel() {
try {
statusEl.textContent = 'モデルを読み込み中... (初回は数分かかることがあります)';
const progressCallback = (progress) => {
if (progress.status === 'progress') {
const percent = Math.round((progress.loaded / progress.total) * 100);
progressBar.style.width = percent + '%';
statusEl.textContent = `モデルを読み込み中... ${percent}%`;
}
};
[model, processor, tokenizer] = await Promise.all([
MoonshineForConditionalGeneration.from_pretrained(MODEL_ID, {
dtype: 'fp32',
progress_callback: progressCallback
}),
AutoProcessor.from_pretrained(MODEL_ID),
AutoTokenizer.from_pretrained(MODEL_ID)
]);
statusEl.textContent = '準備完了!録音を開始できます';
statusEl.className = 'status ready';
startBtn.textContent = '録音開始';
startBtn.disabled = false;
} catch (error) {
statusEl.textContent = `エラー: ${error.message}`;
statusEl.className = 'status error';
}
}
// ============ Common transcription function ============
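// transcribe() is shared by both modes. It skips buffers shorter than 0.1 s
// (1600 samples at 16 kHz) and buffers whose RMS/peak levels look like silence,
// caps max_new_tokens roughly in proportion to the audio duration, then collapses
// runaway repetitions and drops a few phrases treated as likely hallucinations
// on near-silent input.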
async function transcribe(audioData) {
if (!model || !processor || !tokenizer) return null;
if (audioData.length < 1600) return null;
let maxLevel = 0;
let sumSquares = 0;
for (let i = 0; i < audioData.length; i++) {
const abs = Math.abs(audioData[i]);
if (abs > maxLevel) maxLevel = abs;
sumSquares += audioData[i] * audioData[i];
}
const rms = Math.sqrt(sumSquares / audioData.length);
if (rms < 0.01 || maxLevel < 0.05) return null;
const inputs = await processor(audioData);
const audioDuration = audioData.length / SAMPLE_RATE;
const maxTokens = Math.min(Math.round(audioDuration * 25), 150);
const outputs = await model.generate({
...inputs,
max_new_tokens: maxTokens,
});
let text = tokenizer.decode(outputs[0], { skip_special_tokens: true }).trim();
const repeatPattern = /(.{2,}?)\1{4,}/;
if (repeatPattern.test(text)) {
text = text.replace(/(.{2,}?)\1{3,}/g, '$1');
}
const hallucinations = ['彼は私', '彼女は私', 'そう、そう'];
const isHallucination = hallucinations.some(h => text.includes(h) && text.length > 30);
if (isHallucination) return null;
return text;
}
// ============ Chunk Mode ============
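// Chunk mode: record with MediaRecorder for the selected interval, stop, decode
// the resulting blob, resample it to 16 kHz through an OfflineAudioContext,
// transcribe it, then restart the recorder. Simple, but words that straddle a
// chunk boundary can be cut off, as the mode description warns.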
async function transcribeAudioBlob(audioBlob) {
try {
currentTextEl.textContent = '処理中...';
const arrayBuffer = await audioBlob.arrayBuffer();
if (!audioContext) {
audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: SAMPLE_RATE });
}
// decodedBuffer deliberately avoids shadowing the global audioBuffer array used by overlap mode.
const decodedBuffer = await audioContext.decodeAudioData(arrayBuffer);
// Render through an OfflineAudioContext to get mono 16 kHz samples for the model.
const offlineCtx = new OfflineAudioContext(
1,
Math.ceil(decodedBuffer.duration * SAMPLE_RATE),
SAMPLE_RATE
);
const source = offlineCtx.createBufferSource();
source.buffer = decodedBuffer;
source.connect(offlineCtx.destination);
source.start();
const resampled = await offlineCtx.startRendering();
const audioData = resampled.getChannelData(0);
const text = await transcribe(audioData);
if (text) {
currentTextEl.textContent = text;
transcriptEl.textContent += text + '\n';
} else {
currentTextEl.textContent = '(音声が検出されませんでした)';
}
} catch (error) {
currentTextEl.textContent = `エラー: ${error.message}`;
}
}
async function startChunkRecording() {
try {
const stream = await navigator.mediaDevices.getUserMedia({
audio: { channelCount: 1, sampleRate: SAMPLE_RATE }
});
audioChunks = [];
// audio/webm;codecs=opus is not supported everywhere (e.g. Safari); fall back to the browser default when needed.
const preferredType = 'audio/webm;codecs=opus';
mediaRecorder = new MediaRecorder(stream,
MediaRecorder.isTypeSupported(preferredType) ? { mimeType: preferredType } : {});
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
audioChunks.push(event.data);
}
};
const processAndRestart = async () => {
if (!isRecording) return;
mediaRecorder.stop();
};
mediaRecorder.onstop = async () => {
if (audioChunks.length > 0 && isRecording) {
const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType || 'audio/webm' });
audioChunks = [];
await transcribeAudioBlob(audioBlob);
}
// Restart even when no data arrived, otherwise the record/stop loop would silently stall.
if (isRecording && mediaRecorder.stream.active) {
const intervalMs = parseFloat(intervalSlider.value) * 1000;
mediaRecorder.start(500);
setTimeout(processAndRestart, intervalMs);
}
};
mediaRecorder.start(500);
isRecording = true;
const intervalMs = parseFloat(intervalSlider.value) * 1000;
setTimeout(processAndRestart, intervalMs);
updateRecordingUI();
} catch (error) {
statusEl.textContent = `マイクエラー: ${error.message}`;
statusEl.className = 'status error';
}
}
function stopChunkRecording() {
if (mediaRecorder && mediaRecorder.state !== 'inactive') {
mediaRecorder.stop();
mediaRecorder.stream.getTracks().forEach(track => track.stop());
}
isRecording = false;
audioChunks = [];
}
// ============ Overlap Mode ============
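// Overlap mode: raw samples are captured continuously with a ScriptProcessorNode
// (deprecated in favour of AudioWorklet, but still the simplest widely supported
// option) into a rolling buffer. Every interval the last WINDOW_SEC seconds are
// transcribed, and removeDuplicateText() trims the longest suffix of the previous
// result that reappears as a prefix of the new one, so overlapping windows do not
// duplicate text in the transcript.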
function removeDuplicateText(prevText, newText) {
if (!prevText || !newText) return newText;
const minOverlap = 2;
const maxOverlap = Math.min(prevText.length, newText.length, 20);
for (let len = maxOverlap; len >= minOverlap; len--) {
const prevEnd = prevText.slice(-len);
if (newText.startsWith(prevEnd)) {
return newText.slice(len);
}
}
return newText;
}
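// The isProcessing flag keeps the interval timer re-entrant safe: if a transcription
// is still running when the timer fires again, the new tick is skipped instead of
// queueing overlapping generate() calls.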
async function processAudioWindow() {
if (!model || !processor || !tokenizer || !isRecording) return;
if (isProcessing) return;
isProcessing = true;
try {
const windowSamples = WINDOW_SEC * SAMPLE_RATE;
if (audioBuffer.length < windowSamples * 0.5) {
currentTextEl.textContent = '(音声を収集中...)';
return;
}
const startIdx = Math.max(0, audioBuffer.length - windowSamples);
const audioData = new Float32Array(audioBuffer.slice(startIdx));
currentTextEl.textContent = '処理中...';
const text = await transcribe(audioData);
if (text) {
const uniqueText = removeDuplicateText(lastTranscript, text);
if (uniqueText && uniqueText.length > 0) {
currentTextEl.textContent = text;
transcriptEl.textContent += uniqueText;
lastTranscript = text;
}
} else {
currentTextEl.textContent = '(音声が検出されませんでした)';
}
} catch (error) {
currentTextEl.textContent = `エラー: ${error.message}`;
} finally {
isProcessing = false;
}
}
async function startOverlapRecording() {
try {
mediaStream = await navigator.mediaDevices.getUserMedia({
audio: { channelCount: 1, sampleRate: SAMPLE_RATE }
});
audioContext = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: SAMPLE_RATE
});
const source = audioContext.createMediaStreamSource(mediaStream);
const bufferSize = 4096;
scriptProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
const maxBufferSize = SAMPLE_RATE * 10;
scriptProcessor.onaudioprocess = (e) => {
if (!isRecording) return;
const inputData = e.inputBuffer.getChannelData(0);
for (let i = 0; i < inputData.length; i++) {
audioBuffer.push(inputData[i]);
}
// Trim the rolling buffer in one splice; shifting one sample at a time is far
// too slow inside the audio callback once the buffer is full.
if (audioBuffer.length > maxBufferSize) {
audioBuffer.splice(0, audioBuffer.length - maxBufferSize);
}
};
source.connect(scriptProcessor);
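// Connecting to the destination is required in some browsers for onaudioprocess to
// fire; the handler never writes to the output buffer, so nothing audible is played back.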
scriptProcessor.connect(audioContext.destination);
audioBuffer = [];
lastTranscript = '';
isRecording = true;
isProcessing = false;
const intervalMs = parseFloat(intervalSlider.value) * 1000;
processTimer = setInterval(processAudioWindow, intervalMs);
updateRecordingUI();
} catch (error) {
statusEl.textContent = `マイクエラー: ${error.message}`;
statusEl.className = 'status error';
}
}
function stopOverlapRecording() {
isRecording = false;
if (processTimer) {
clearInterval(processTimer);
processTimer = null;
}
if (scriptProcessor) {
scriptProcessor.disconnect();
scriptProcessor = null;
}
if (audioContext) {
audioContext.close();
audioContext = null;
}
if (mediaStream) {
mediaStream.getTracks().forEach(track => track.stop());
mediaStream = null;
}
audioBuffer = [];
}
// ============ UI Helpers ============
function updateRecordingUI() {
statusEl.textContent = '録音中... マイクに向かって話してください';
statusEl.className = 'status recording';
startBtn.textContent = '録音停止';
startBtn.classList.add('recording');
modeChunkBtn.disabled = true;
modeOverlapBtn.disabled = true;
}
function updateStoppedUI() {
statusEl.textContent = '録音停止。再開するにはボタンをクリック';
statusEl.className = 'status ready';
startBtn.textContent = '録音開始';
startBtn.classList.remove('recording');
currentTextEl.textContent = '';
modeChunkBtn.disabled = false;
modeOverlapBtn.disabled = false;
}
// ============ Main Controls ============
function startRecording() {
if (currentMode === 'chunk') {
startChunkRecording();
} else {
startOverlapRecording();
}
}
function stopRecording() {
if (currentMode === 'chunk') {
stopChunkRecording();
} else {
stopOverlapRecording();
}
updateStoppedUI();
}
startBtn.addEventListener('click', () => {
if (isRecording) {
stopRecording();
} else {
startRecording();
}
});
loadModel();
</script>
</body>
</html>