Add mode switching: chunk mode vs overlap mode for seamless transcription
index.html (CHANGED: +320 -73)
@@ -101,19 +101,19 @@
}
}

-.
+.control-container {
    margin: 20px 0;
    padding: 15px;
    background: #16213e;
    border-radius: 8px;
}

-.
+.control-container label {
    display: block;
    margin-bottom: 10px;
}

-.
+.control-container input[type="range"] {
    width: 100%;
    cursor: pointer;
}
@@ -126,6 +126,42 @@
    margin-top: 5px;
}

+.mode-switch {
+    display: flex;
+    gap: 10px;
+    margin-bottom: 15px;
+}
+
+.mode-switch button {
+    flex: 1;
+    width: auto;
+    margin: 0;
+    padding: 10px 15px;
+    font-size: 14px;
+    border-radius: 8px;
+    background: #0f0f23;
+    color: #888;
+}
+
+.mode-switch button.active {
+    background: linear-gradient(135deg, #00d4ff, #00ff88);
+    color: #1a1a2e;
+    font-weight: bold;
+}
+
+.mode-switch button:disabled {
+    opacity: 0.5;
+}
+
+.mode-description {
+    font-size: 12px;
+    color: #888;
+    margin-top: 10px;
+    padding: 10px;
+    background: #0f0f23;
+    border-radius: 6px;
+}
+
#transcript {
    background: #16213e;
    border-radius: 12px;
@@ -213,9 +249,19 @@

    <button id="startBtn" disabled>読み込み中...</button>

-    <div class="
+    <div class="control-container">
+        <div class="mode-switch">
+            <button id="modeChunk" class="active">区切りモード</button>
+            <button id="modeOverlap">オーバーラップモード</button>
+        </div>
+        <div id="modeDescription" class="mode-description">
+            指定間隔ごとに録音を区切って処理。シンプルだが境界で言葉が途切れる可能性あり。
+        </div>
+    </div>
+
+    <div class="control-container">
        <label for="intervalSlider">
-
+            <span id="intervalLabel">録音間隔</span>: <span id="intervalValue">3</span>秒
        </label>
        <input type="range" id="intervalSlider" min="1" max="6" step="0.5" value="3">
        <div class="slider-labels">
@@ -230,9 +276,9 @@
    <div class="info">
        <strong>使い方:</strong><br>
        1. モデルの読み込みを待つ(初回は数分かかります)<br>
-        2.
-        3.
-        4.
+        2. モードを選択(オーバーラップ推奨)<br>
+        3. 「録音開始」ボタンをクリック<br>
+        4. マイクに向かって話す<br><br>
        <strong>モデル:</strong> <a href="https://huggingface.co/wmoto-ai/moonshine-tiny-ja-ONNX" target="_blank">wmoto-ai/moonshine-tiny-ja-ONNX</a><br>
        <strong>ベース:</strong> <a href="https://huggingface.co/UsefulSensors/moonshine-tiny-ja" target="_blank">UsefulSensors/moonshine-tiny-ja</a>
    </div>
@@ -257,21 +303,67 @@
const progressBar = document.getElementById('progressBar');
const intervalSlider = document.getElementById('intervalSlider');
const intervalValue = document.getElementById('intervalValue');
-
-
-
-
+const intervalLabel = document.getElementById('intervalLabel');
+const modeChunkBtn = document.getElementById('modeChunk');
+const modeOverlapBtn = document.getElementById('modeOverlap');
+const modeDescription = document.getElementById('modeDescription');

let model = null;
let processor = null;
let tokenizer = null;
let isRecording = false;
+let isProcessing = false;
+let currentMode = 'chunk';
+
+// Chunk mode variables
let mediaRecorder = null;
-let audioContext = null;
let audioChunks = [];

+// Overlap mode variables
+let audioContext = null;
+let mediaStream = null;
+let scriptProcessor = null;
+let audioBuffer = [];
+let processTimer = null;
+let lastTranscript = '';
+
+const SAMPLE_RATE = 16000;
+const WINDOW_SEC = 4;
const MODEL_ID = 'wmoto-ai/moonshine-tiny-ja-ONNX';

+// Mode switching
+function setMode(mode) {
+    if (isRecording) return;
+    currentMode = mode;
+
+    if (mode === 'chunk') {
+        modeChunkBtn.classList.add('active');
+        modeOverlapBtn.classList.remove('active');
+        modeDescription.textContent = '指定間隔ごとに録音を区切って処理。シンプルだが境界で言葉が途切れる可能性あり。';
+        intervalLabel.textContent = '録音間隔';
+        intervalSlider.min = '1';
+        intervalSlider.max = '6';
+        intervalSlider.value = '3';
+        intervalValue.textContent = '3';
+    } else {
+        modeChunkBtn.classList.remove('active');
+        modeOverlapBtn.classList.add('active');
+        modeDescription.textContent = '連続バッファ + オーバーラップ処理。境界での途切れを防ぎ、滑らかな文字起こしを実現。';
+        intervalLabel.textContent = '処理間隔';
+        intervalSlider.min = '1';
+        intervalSlider.max = '4';
+        intervalSlider.value = '2';
+        intervalValue.textContent = '2';
+    }
+}
+
+modeChunkBtn.addEventListener('click', () => setMode('chunk'));
+modeOverlapBtn.addEventListener('click', () => setMode('overlap'));
+
+intervalSlider.addEventListener('input', () => {
+    intervalValue.textContent = intervalSlider.value;
+});
+
async function loadModel() {
    try {
        statusEl.textContent = 'モデルを読み込み中... (初回は数分かかることがあります)';
@@ -298,31 +390,69 @@
        startBtn.textContent = '録音開始';
        startBtn.disabled = false;
    } catch (error) {
-        console.error('Model loading error:', error);
        statusEl.textContent = `エラー: ${error.message}`;
        statusEl.className = 'status error';
    }
}

-
-
+// ============ Common transcription function ============
+async function transcribe(audioData) {
+    if (!model || !processor || !tokenizer) return null;
+
+    if (audioData.length < 1600) return null;
+
+    let maxLevel = 0;
+    let sumSquares = 0;
+    for (let i = 0; i < audioData.length; i++) {
+        const abs = Math.abs(audioData[i]);
+        if (abs > maxLevel) maxLevel = abs;
+        sumSquares += audioData[i] * audioData[i];
+    }
+    const rms = Math.sqrt(sumSquares / audioData.length);
+
+    if (rms < 0.01 || maxLevel < 0.05) return null;
+
+    const inputs = await processor(audioData);

+    const audioDuration = audioData.length / SAMPLE_RATE;
+    const maxTokens = Math.min(Math.round(audioDuration * 25), 150);
+    const outputs = await model.generate({
+        ...inputs,
+        max_new_tokens: maxTokens,
+    });
+
+    let text = tokenizer.decode(outputs[0], { skip_special_tokens: true }).trim();
+
+    const repeatPattern = /(.{2,}?)\1{4,}/;
+    if (repeatPattern.test(text)) {
+        text = text.replace(/(.{2,}?)\1{3,}/g, '$1');
+    }
+
+    const hallucinations = ['彼は私', '彼女は私', 'そう、そう'];
+    const isHallucination = hallucinations.some(h => text.includes(h) && text.length > 30);
+
+    if (isHallucination) return null;
+
+    return text;
+}
+
+// ============ Chunk Mode ============
+async function transcribeAudioBlob(audioBlob) {
    try {
        currentTextEl.textContent = '処理中...';

        const arrayBuffer = await audioBlob.arrayBuffer();

        if (!audioContext) {
-            audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate:
+            audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: SAMPLE_RATE });
        }

        const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);

-        const targetSampleRate = 16000;
        const offlineCtx = new OfflineAudioContext(
            1,
-            Math.ceil(audioBuffer.duration *
-
+            Math.ceil(audioBuffer.duration * SAMPLE_RATE),
+            SAMPLE_RATE
        );

        const source = offlineCtx.createBufferSource();
@@ -333,65 +463,23 @@
        const resampled = await offlineCtx.startRendering();
        const audioData = resampled.getChannelData(0);

-
-            currentTextEl.textContent = '(音声が短すぎます)';
-            return;
-        }
-
-        let maxLevel = 0;
-        let sumSquares = 0;
-        for (let i = 0; i < audioData.length; i++) {
-            const abs = Math.abs(audioData[i]);
-            if (abs > maxLevel) maxLevel = abs;
-            sumSquares += audioData[i] * audioData[i];
-        }
-        const rms = Math.sqrt(sumSquares / audioData.length);
-
-        if (rms < 0.01 || maxLevel < 0.05) {
-            currentTextEl.textContent = '(音声が小さすぎます)';
-            return;
-        }
-
-        const inputs = await processor(audioData);
-
-        const intervalSec = parseFloat(intervalSlider.value);
-        const maxTokens = Math.min(Math.round(intervalSec * 25), 150);
-        const outputs = await model.generate({
-            ...inputs,
-            max_new_tokens: maxTokens,
-        });
-
-        let text = tokenizer.decode(outputs[0], { skip_special_tokens: true }).trim();
+        const text = await transcribe(audioData);

-
-        if (repeatPattern.test(text)) {
-            text = text.replace(/(.{2,}?)\1{3,}/g, '$1');
-        }
-
-        const hallucinations = ['彼は私', '彼女は私', 'そう、そう'];
-        const isHallucination = hallucinations.some(h => text.includes(h) && text.length > 30);
-
-        if (text && !isHallucination) {
+        if (text) {
            currentTextEl.textContent = text;
            transcriptEl.textContent += text + '\n';
-        } else if (isHallucination) {
-            currentTextEl.textContent = '(ノイズ検出)';
        } else {
            currentTextEl.textContent = '(音声が検出されませんでした)';
        }
    } catch (error) {
-        console.error('Transcription error:', error);
        currentTextEl.textContent = `エラー: ${error.message}`;
    }
}

-async function
+async function startChunkRecording() {
    try {
        const stream = await navigator.mediaDevices.getUserMedia({
-            audio: {
-                channelCount: 1,
-                sampleRate: 16000,
-            }
+            audio: { channelCount: 1, sampleRate: SAMPLE_RATE }
        });

        audioChunks = [];
@@ -416,7 +504,7 @@
        const audioBlob = new Blob(audioChunks, { type: 'audio/webm;codecs=opus' });
        audioChunks = [];

-        await
+        await transcribeAudioBlob(audioBlob);

        if (isRecording && mediaRecorder.stream.active) {
            const intervalMs = parseFloat(intervalSlider.value) * 1000;
@@ -432,30 +520,189 @@
        const intervalMs = parseFloat(intervalSlider.value) * 1000;
        setTimeout(processAndRestart, intervalMs);

-
-        statusEl.className = 'status recording';
-        startBtn.textContent = '録音停止';
-        startBtn.classList.add('recording');
+        updateRecordingUI();
    } catch (error) {
-        console.error('Recording error:', error);
        statusEl.textContent = `マイクエラー: ${error.message}`;
        statusEl.className = 'status error';
    }
}

-function
+function stopChunkRecording() {
    if (mediaRecorder && mediaRecorder.state !== 'inactive') {
        mediaRecorder.stop();
        mediaRecorder.stream.getTracks().forEach(track => track.stop());
    }
    isRecording = false;
    audioChunks = [];
+}
+
+// ============ Overlap Mode ============
+function removeDuplicateText(prevText, newText) {
+    if (!prevText || !newText) return newText;

+    const minOverlap = 2;
+    const maxOverlap = Math.min(prevText.length, newText.length, 20);
+
+    for (let len = maxOverlap; len >= minOverlap; len--) {
+        const prevEnd = prevText.slice(-len);
+        if (newText.startsWith(prevEnd)) {
+            return newText.slice(len);
+        }
+    }
+
+    return newText;
+}
+
+async function processAudioWindow() {
+    if (!model || !processor || !tokenizer || !isRecording) return;
+    if (isProcessing) return;
+
+    isProcessing = true;
+
+    try {
+        const windowSamples = WINDOW_SEC * SAMPLE_RATE;
+
+        if (audioBuffer.length < windowSamples * 0.5) {
+            currentTextEl.textContent = '(音声を収集中...)';
+            return;
+        }
+
+        const startIdx = Math.max(0, audioBuffer.length - windowSamples);
+        const audioData = new Float32Array(audioBuffer.slice(startIdx));
+
+        currentTextEl.textContent = '処理中...';
+
+        const text = await transcribe(audioData);
+
+        if (text) {
+            const uniqueText = removeDuplicateText(lastTranscript, text);
+
+            if (uniqueText && uniqueText.length > 0) {
+                currentTextEl.textContent = text;
+                transcriptEl.textContent += uniqueText;
+                lastTranscript = text;
+            }
+        } else {
+            currentTextEl.textContent = '(音声が検出されませんでした)';
+        }
+    } catch (error) {
+        currentTextEl.textContent = `エラー: ${error.message}`;
+    } finally {
+        isProcessing = false;
+    }
+}
+
+async function startOverlapRecording() {
+    try {
+        mediaStream = await navigator.mediaDevices.getUserMedia({
+            audio: { channelCount: 1, sampleRate: SAMPLE_RATE }
+        });
+
+        audioContext = new (window.AudioContext || window.webkitAudioContext)({
+            sampleRate: SAMPLE_RATE
+        });
+
+        const source = audioContext.createMediaStreamSource(mediaStream);
+
+        const bufferSize = 4096;
+        scriptProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
+
+        const maxBufferSize = SAMPLE_RATE * 10;
+
+        scriptProcessor.onaudioprocess = (e) => {
+            if (!isRecording) return;
+
+            const inputData = e.inputBuffer.getChannelData(0);
+
+            for (let i = 0; i < inputData.length; i++) {
+                audioBuffer.push(inputData[i]);
+            }
+
+            while (audioBuffer.length > maxBufferSize) {
+                audioBuffer.shift();
+            }
+        };
+
+        source.connect(scriptProcessor);
+        scriptProcessor.connect(audioContext.destination);
+
+        audioBuffer = [];
+        lastTranscript = '';
+        isRecording = true;
+        isProcessing = false;
+
+        const intervalMs = parseFloat(intervalSlider.value) * 1000;
+        processTimer = setInterval(processAudioWindow, intervalMs);
+
+        updateRecordingUI();
+    } catch (error) {
+        statusEl.textContent = `マイクエラー: ${error.message}`;
+        statusEl.className = 'status error';
+    }
+}
+
+function stopOverlapRecording() {
+    isRecording = false;
+
+    if (processTimer) {
+        clearInterval(processTimer);
+        processTimer = null;
+    }
+
+    if (scriptProcessor) {
+        scriptProcessor.disconnect();
+        scriptProcessor = null;
+    }
+
+    if (audioContext) {
+        audioContext.close();
+        audioContext = null;
+    }
+
+    if (mediaStream) {
+        mediaStream.getTracks().forEach(track => track.stop());
+        mediaStream = null;
+    }
+
+    audioBuffer = [];
+}
+
+// ============ UI Helpers ============
+function updateRecordingUI() {
+    statusEl.textContent = '録音中... マイクに向かって話してください';
+    statusEl.className = 'status recording';
+    startBtn.textContent = '録音停止';
+    startBtn.classList.add('recording');
+    modeChunkBtn.disabled = true;
+    modeOverlapBtn.disabled = true;
+}
+
+function updateStoppedUI() {
    statusEl.textContent = '録音停止。再開するにはボタンをクリック';
    statusEl.className = 'status ready';
    startBtn.textContent = '録音開始';
    startBtn.classList.remove('recording');
    currentTextEl.textContent = '';
+    modeChunkBtn.disabled = false;
+    modeOverlapBtn.disabled = false;
+}
+
+// ============ Main Controls ============
+function startRecording() {
+    if (currentMode === 'chunk') {
+        startChunkRecording();
+    } else {
+        startOverlapRecording();
+    }
+}
+
+function stopRecording() {
+    if (currentMode === 'chunk') {
+        stopChunkRecording();
+    } else {
+        stopOverlapRecording();
+    }
+    updateStoppedUI();
}

startBtn.addEventListener('click', () => {
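Note: the overlap mode's deduplication can be exercised on its own. The sketch below copies removeDuplicateText verbatim from the diff above and calls it with made-up sample strings (illustrative only, not model output). It trims the longest boundary overlap (2 to 20 characters) between the end of the previous window's transcript and the start of the new window's transcript, so overlapping windows do not repeat words in the accumulated text.

// Standalone copy of removeDuplicateText from the diff above, runnable in Node or a browser console.
function removeDuplicateText(prevText, newText) {
    if (!prevText || !newText) return newText;

    const minOverlap = 2;
    const maxOverlap = Math.min(prevText.length, newText.length, 20);

    // Try the longest possible overlap first, shrinking until a match is found.
    for (let len = maxOverlap; len >= minOverlap; len--) {
        const prevEnd = prevText.slice(-len);
        if (newText.startsWith(prevEnd)) {
            return newText.slice(len);
        }
    }

    return newText;
}

// Hypothetical sample inputs:
console.log(removeDuplicateText('こんにちは、今日は', '今日はいい天気ですね')); // => 'いい天気ですね'
console.log(removeDuplicateText('おはよう', 'ありがとう'));                     // => 'ありがとう' (no overlap, text kept as-is)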