T3Chat OpenAI TTS & STT

Adds OpenAI text-to-speech and speech-to-text to T3Chat

You will need to install an extension such as Tampermonkey, Greasemonkey or Violentmonkey to install this script.

You will need to install an extension such as Tampermonkey or Violentmonkey to install this script.

You will need to install an extension such as Tampermonkey or Violentmonkey to install this script.

You will need to install an extension such as Tampermonkey or Userscripts to install this script.

You will need to install an extension such as Tampermonkey to install this script.

You will need to install a user script manager extension to install this script.

(I already have a user script manager, let me install it!)

You will need to install an extension such as Stylus to install this style.

You will need to install an extension such as Stylus to install this style.

You will need to install an extension such as Stylus to install this style.

You will need to install a user style manager extension to install this style.

You will need to install a user style manager extension to install this style.

You will need to install a user style manager extension to install this style.

(I already have a user style manager, let me install it!)

// ==UserScript==
// @name         T3Chat OpenAI TTS & STT
// @namespace    https://github.com/cameron/t3chat-userscripts
// @version      0.1.2
// @description  Adds OpenAI text-to-speech and speech-to-text to T3Chat
// @match        https://t3.chat/*
// @match        https://*.t3.chat/*
// @run-at       document-idle
// @grant        none
// @license      MIT
// ==/UserScript==

(() => {
  'use strict';

  // Static configuration for the script: OpenAI endpoint/model defaults and the
  // localStorage keys used to persist user preferences.
  const CONFIG = {
    // Prefix prepended to every endpoint path passed to fetch.
    apiBaseUrl: 'https://api.openai.com/v1',
    // Model name sent with /audio/speech requests.
    ttsModel: 'tts-1',
    // Fallback voice used when no voice has been saved in localStorage.
    ttsVoice: 'alloy',
    // Model name sent with /audio/transcriptions requests.
    sttModel: 'whisper-1',
    // Delay (ms) passed to setTimeout before a running recording is stopped automatically.
    maxRecordingTime: 60000,
    // Compared with the stored version at startup; a mismatch resets the saved STT method.
    currentVersion: '0.1.2',
    storageKeys: {
      // Key the OpenAI API key is read from (presumably written by T3Chat itself — confirm).
      t3chatApiKey: 'apikey:openai',
      ttsEnabled: 't3chat-tts-enabled',
      sttEnabled: 't3chat-stt-enabled',
      ttsVoice: 't3chat-tts-voice',
      sttMethod: 't3chat-stt-method',
      version: 't3chat-tts-stt-version'
    }
  };

  // Settings migration: when the stored script version differs from the running
  // one, forget the saved STT method and record the new version.
  {
    const { version: versionKey, sttMethod: sttMethodKey } = CONFIG.storageKeys;
    const isCurrent = localStorage.getItem(versionKey) === CONFIG.currentVersion;
    if (!isCurrent) {
      localStorage.removeItem(sttMethodKey);
      localStorage.setItem(versionKey, CONFIG.currentVersion);
    }
  }

  // CSS selectors used to locate host-page elements. They target the page's
  // current markup/class names and will need updating if that markup changes.
  const SELECTORS = {
    // Candidate selectors for the chat textarea, tried in order.
    chatInput: [
      '#chat-input',
      'textarea[aria-describedby="chat-input-description"]',
      'textarea[placeholder*="message"]',
      'textarea[data-testid="chat-input"]'
    ],
    // Elements treated as individual chat messages.
    messageContainer: '[role="article"], .message, div[class*="message"]',
    // Element inside a message whose text is spoken.
    messageContent: '.prose, .message-content, div[class*="prose"], p, div[class*="text"]',
    // Toolbar next to a message where the speak button is inserted.
    messageActionsContainer:
      'div[class*="absolute"][class*="flex"][class*="items-center"][class*="gap"], div.absolute.left-0[class*="-ml-0"][class*="mt-2"], div.absolute.right-0[class*="mt-"]',
    // Send button; the injected controls are placed just before it.
    sendButton: 'button[type="submit"][aria-label*="Message"], button[aria-label*="send" i]'
  };

  /**
   * Reads the OpenAI API key from localStorage.
   * @returns {string|null} The stored value when it starts with "sk-", otherwise null.
   */
  const getT3ChatApiKey = () => {
    const stored = localStorage.getItem(CONFIG.storageKeys.t3chatApiKey);
    if (stored && stored.startsWith('sk-')) return stored;
    return null;
  };

  // Mutable runtime state. Preferences are seeded from localStorage; the API key
  // is re-read on every access so changes made by the host page are picked up.
  const state = {
    get apiKey() {
      return getT3ChatApiKey();
    },
    // Both feature flags count as enabled unless the stored value is exactly 'false'.
    ttsEnabled: localStorage.getItem(CONFIG.storageKeys.ttsEnabled) !== 'false',
    sttEnabled: localStorage.getItem(CONFIG.storageKeys.sttEnabled) !== 'false',
    sttMethod: localStorage.getItem(CONFIG.storageKeys.sttMethod) || 'openai',
    ttsVoice: localStorage.getItem(CONFIG.storageKeys.ttsVoice) || CONFIG.ttsVoice,
    isRecording: false,
    mediaRecorder: null,
    audioChunks: [],
    currentAudio: null,
    recordingMimeType: '',
    speechRecognition: null
  };

  // First run: persist the "enabled" default for each feature flag that has
  // never been stored.
  for (const flag of ['ttsEnabled', 'sttEnabled']) {
    const storageKey = CONFIG.storageKeys[flag];
    if (localStorage.getItem(storageKey) === null) {
      localStorage.setItem(storageKey, 'true');
      state[flag] = true;
    }
  }

  /**
   * Locates the chat textarea by trying each candidate selector in order.
   * @returns {Element|undefined} The first match that is a TEXTAREA, or undefined.
   */
  const findChatInput = () => {
    for (const selector of SELECTORS.chatInput) {
      const candidate = document.querySelector(selector);
      if (candidate && candidate.tagName === 'TEXTAREA') return candidate;
    }
    return undefined;
  };

  /**
   * Finds the element that should host the injected controls.
   * Prefers the send button's parent; otherwise falls back to the nearest
   * flex-classed div around the textarea, then to the textarea's parent.
   * @returns {Element|null} The container, or null when no chat input exists.
   */
  const findInputContainer = () => {
    const textarea = findChatInput();
    if (!textarea) return null;

    const wrapper = textarea.parentElement;
    const submit =
      document.querySelector(SELECTORS.sendButton) ||
      wrapper?.querySelector('button[type="submit"]') ||
      wrapper?.querySelector('button[aria-label*="send" i]');
    if (submit) return submit.parentElement;

    return textarea.closest('div[class*="flex"]') || wrapper;
  };

  /**
   * Injects the script's stylesheet into <head> once (guarded by element id).
   *
   * The recording animation uses a namespaced keyframes name: a bare
   * `@keyframes pulse` is global and would clash with any `pulse` animation the
   * host page already defines.
   */
  const injectStyles = () => {
    if (document.querySelector('#t3chat-tts-stt-styles')) return;
    const style = document.createElement('style');
    style.id = 't3chat-tts-stt-styles';
    style.textContent = `
      .t3-tts-btn,.t3-stt-btn,.t3-settings-btn{
        display:flex;align-items:center;justify-content:center;width:32px;height:32px;border:1px solid hsl(var(--border));
        border-radius:6px;background:hsl(var(--background));color:hsl(var(--foreground));cursor:pointer;
        transition:all .2s ease;position:relative;flex-shrink:0
      }
      .t3-tts-btn:hover,.t3-stt-btn:hover,.t3-settings-btn:hover{background:hsl(var(--muted));border-color:hsl(var(--ring))}
      .t3-stt-btn.recording{background:#ef4444;color:#fff;animation:t3-tts-stt-pulse 1s infinite}
      .t3-tts-btn.speaking{background:#3b82f6;color:#fff}
      .t3-tts-btn.disabled,.t3-stt-btn.disabled{opacity:.5;cursor:not-allowed}
      @keyframes t3-tts-stt-pulse{0%,100%{opacity:1}50%{opacity:.7}}
      .t3-tooltip{position:absolute;bottom:100%;left:50%;transform:translateX(-50%);background:hsl(var(--foreground));
        color:hsl(var(--background));padding:4px 8px;border-radius:4px;font-size:12px;white-space:nowrap;opacity:0;
        pointer-events:none;transition:opacity .2s ease;margin-bottom:4px;z-index:1000}
      .t3-tts-btn:hover .t3-tooltip,.t3-stt-btn:hover .t3-tooltip,.t3-settings-btn:hover .t3-tooltip{opacity:1}
      button[aria-label="Speak message"].speaking{background:#3b82f6!important;color:#fff!important}
      button[aria-label="Speak message"]{width:32px!important;height:32px!important;min-width:32px!important;min-height:32px!important;
        display:flex!important;align-items:center!important;justify-content:center!important}
      button[aria-label="Speak message"] .relative,button[aria-label="Speak message"] svg{width:24px!important;height:24px!important}
    `;
    document.head.appendChild(style);
  };

  /**
   * POSTs a JSON payload to an OpenAI endpoint using the stored API key.
   *
   * @param {string} endpoint - Path appended to CONFIG.apiBaseUrl (e.g. '/audio/speech').
   * @param {object} data - Request payload; serialized with JSON.stringify.
   * @param {object} [options] - Extra fetch options. `options.headers` is merged
   *   into the default headers; other keys override the defaults.
   * @returns {Promise<Response>} The successful response.
   * @throws {Error} When no API key is available or the response is not ok.
   */
  const callOpenAI = async (endpoint, data, options = {}) => {
    const apiKey = state.apiKey;
    if (!apiKey) throw new Error('OpenAI API key not configured');

    // Split headers from the other options so that caller-supplied headers are
    // merged with, rather than replace, Authorization and Content-Type.
    const { headers: extraHeaders, ...fetchOptions } = options;
    const res = await fetch(`${CONFIG.apiBaseUrl}${endpoint}`, {
      method: 'POST',
      body: JSON.stringify(data),
      ...fetchOptions,
      headers: {
        Authorization: `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
        ...extraHeaders
      }
    });
    if (!res.ok) {
      // The error body may not be JSON; fall back to the HTTP status.
      const err = await res.json().catch(() => ({ error: { message: `HTTP ${res.status}` } }));
      throw new Error(err.error?.message || `HTTP ${res.status}`);
    }
    return res;
  };

  /**
   * Requests synthesized speech for `text` and wraps it in an Audio element.
   * Any previously created audio is paused and its object URL released; the new
   * element becomes state.currentAudio.
   *
   * @param {string} text - Text to speak; only the first 4096 characters are sent.
   * @returns {Promise<HTMLAudioElement>} An audio element that has not started playing.
   */
  const textToSpeech = async (text) => {
    const payload = {
      model: CONFIG.ttsModel,
      voice: state.ttsVoice,
      input: text.slice(0, 4096)
    };
    const response = await callOpenAI('/audio/speech', payload);
    const objectUrl = URL.createObjectURL(await response.blob());

    const previous = state.currentAudio;
    if (previous) {
      previous.pause();
      URL.revokeObjectURL(previous.src);
    }

    const player = new Audio(objectUrl);
    state.currentAudio = player;
    return player;
  };

  /**
   * Uploads a recorded audio blob to the OpenAI transcription endpoint.
   *
   * @param {Blob} blob - Recorded audio; its MIME type selects the upload file extension.
   * @returns {Promise<string>} The transcribed text.
   * @throws {Error} When no API key is available or the request fails.
   */
  const speechToText = async (blob) => {
    // Fail fast instead of sending "Bearer null" to the API.
    const apiKey = state.apiKey;
    if (!apiKey) throw new Error('OpenAI API key not configured');

    // Ordered [MIME fragment, extension] pairs; the first fragment contained in
    // the blob type wins, and anything unrecognised is uploaded as webm.
    const extensionByFragment = [
      ['wav', 'wav'],
      ['mp4', 'mp4'],
      ['mp3', 'mp3'],
      ['mpeg', 'mp3'], // audio/mpeg is the registered MIME type for MP3
      ['ogg', 'ogg']
    ];
    const mime = blob.type.toLowerCase();
    const match = extensionByFragment.find(([fragment]) => mime.includes(fragment));
    const ext = match ? match[1] : 'webm';

    const form = new FormData();
    form.append('file', blob, `audio.${ext}`);
    form.append('model', CONFIG.sttModel);

    // No Content-Type header: fetch derives the multipart boundary from the FormData body.
    const res = await fetch(`${CONFIG.apiBaseUrl}/audio/transcriptions`, {
      method: 'POST',
      headers: { Authorization: `Bearer ${apiKey}` },
      body: form
    });
    if (!res.ok) {
      const txt = await res.text();
      throw new Error(`STT failed: ${txt}`);
    }
    const json = await res.json();
    return json.text;
  };

  /**
   * Creates a browser speech-recognition instance wired to the chat input.
   * Recognised text is appended to the textarea and an `input` event is
   * dispatched; start/error/end update the recording flag and the STT button.
   *
   * @returns {object|null} The configured recognizer, or null when the browser
   *   exposes neither SpeechRecognition nor webkitSpeechRecognition.
   */
  const initSpeechRecognition = () => {
    const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
    if (!Recognition) return null;

    // Shared by the error and end handlers.
    const markStopped = () => {
      state.isRecording = false;
      updateSTTButton();
    };

    const rec = new Recognition();
    Object.assign(rec, {
      continuous: false,
      interimResults: false,
      maxAlternatives: 1,
      lang: 'en-US'
    });

    rec.onstart = () => {
      state.isRecording = true;
      updateSTTButton();
    };
    rec.onresult = (event) => {
      const transcript = event.results[0][0].transcript;
      const input = findChatInput();
      if (!input || !transcript.trim()) return;
      input.value = `${input.value} ${transcript}`.trim();
      input.dispatchEvent(new Event('input', { bubbles: true }));
      input.focus();
    };
    rec.onerror = markStopped;
    rec.onend = markStopped;
    return rec;
  };

  /**
   * Starts capturing speech. In 'browser' mode this delegates to the browser
   * speech-recognition path; otherwise it records from the microphone with
   * MediaRecorder and, once the recorder stops, transcribes the audio and
   * appends the text to the chat input.
   *
   * Failures (permission denied, no supported MIME type, transcription errors)
   * are logged to the console; the microphone stream is always released and the
   * recording flag reset.
   *
   * @returns {Promise<void>}
   */
  const startRecording = async () => {
    if (state.sttMethod === 'browser') return startBrowserSpeechRecognition();

    let stream = null;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      const candidateTypes = [
        'audio/wav',
        'audio/mp4',
        'audio/webm;codecs=opus',
        'audio/webm',
        'audio/ogg;codecs=opus',
        'audio/mp3'
      ];
      const type = candidateTypes.find((t) => MediaRecorder.isTypeSupported(t));
      if (!type) throw new Error('No supported audio MIME type found');

      const recorder = new MediaRecorder(stream, { mimeType: type });
      let autoStopTimer = null;
      state.mediaRecorder = recorder;
      state.audioChunks = [];
      state.recordingMimeType = type;

      recorder.ondataavailable = (e) => {
        if (e.data.size) state.audioChunks.push(e.data);
      };
      recorder.onstop = async () => {
        // Cancel the pending auto-stop so a stale timer cannot fire later.
        clearTimeout(autoStopTimer);
        const blob = new Blob(state.audioChunks, { type: state.recordingMimeType });
        try {
          const txt = await speechToText(blob);
          const input = findChatInput();
          if (input && txt.trim()) {
            input.value = (input.value + ' ' + txt).trim();
            input.dispatchEvent(new Event('input', { bubbles: true }));
            input.focus();
          }
        } catch (err) {
          // Nothing awaits this handler, so report the failure here.
          console.error('[T3Chat TTS/STT] Transcription failed:', err);
        } finally {
          stream.getTracks().forEach((t) => t.stop());
          state.isRecording = false;
          updateSTTButton();
        }
      };

      recorder.start();
      state.isRecording = true;
      updateSTTButton();
      // Stop this specific recorder (not whichever one is current at that time)
      // once the maximum recording time has elapsed.
      autoStopTimer = setTimeout(() => {
        if (recorder.state === 'recording') recorder.stop();
      }, CONFIG.maxRecordingTime);
    } catch (err) {
      // Release the microphone if setup failed after it was acquired.
      stream?.getTracks().forEach((t) => t.stop());
      state.isRecording = false;
      updateSTTButton();
      console.error('[T3Chat TTS/STT] Could not start recording:', err);
    }
  };

  /**
   * Starts browser speech recognition, creating the recognizer on first use and
   * caching it in state. Does nothing when no recognizer could be created.
   */
  const startBrowserSpeechRecognition = () => {
    let recognizer = state.speechRecognition;
    if (!recognizer) {
      recognizer = initSpeechRecognition();
      state.speechRecognition = recognizer;
    }
    recognizer?.start();
  };

  /**
   * Stops the active capture for the current STT method.
   *
   * MediaRecorder.stop() throws InvalidStateError on an inactive recorder, which
   * happens when the button is clicked again while a finished recording is still
   * being transcribed (isRecording stays true until then), so the recorder state
   * is checked first.
   */
  const stopRecording = () => {
    if (state.sttMethod === 'browser') {
      state.speechRecognition?.stop();
      return;
    }
    const recorder = state.mediaRecorder;
    if (recorder && recorder.state !== 'inactive') recorder.stop();
  };

  /**
   * Builds an icon button with a hover tooltip.
   *
   * @param {string} cls - Class name(s) for the button.
   * @param {string} svg - Trusted SVG markup for the icon (inserted as HTML).
   * @param {string} tooltip - Trusted tooltip markup/text (inserted as HTML).
   * @returns {HTMLButtonElement} The new, unattached button.
   */
  const createButton = (cls, svg, tooltip) => {
    const btn = document.createElement('button');
    // A <button> defaults to type="submit"; without this, clicking a control
    // placed inside the chat form would submit the message.
    btn.type = 'button';
    btn.className = cls;
    btn.innerHTML = `${svg}<div class="t3-tooltip">${tooltip}</div>`;
    return btn;
  };

  /**
   * Builds the button that reads the current chat-input text aloud.
   * Clicking it speaks the trimmed textarea content; empty input is ignored.
   * @returns {HTMLButtonElement}
   */
  const createTTSButton = () => {
    const speakerIcon =
      '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><polygon points="11 5,6 9,2 9,2 15,6 15,11 19,11 5"></polygon><path d="M15.54 8.46a5 5 0 0 1 0 7.07"></path><path d="M19.07 4.93a10 10 0 0 1 0 14.14"></path></svg>';
    const button = createButton('t3-tts-btn', speakerIcon, 'Text to Speech');

    const speakInputText = async () => {
      const draft = findChatInput()?.value.trim();
      if (!draft) return;
      await speakText(draft);
    };
    button.addEventListener('click', speakInputText);
    return button;
  };

  /**
   * Builds the microphone button. Each click toggles capture: it stops when a
   * recording is in progress and starts one otherwise.
   * @returns {HTMLButtonElement}
   */
  const createSTTButton = () => {
    const micIcon =
      '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 2a3 3 0 0 0-3 3v7a3 3 0 0 0 6 0V5a3 3 0 0 0-3-3Z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" x2="12" y1="19" y2="22"></line><line x1="8" x2="16" y1="22" y2="22"></line></svg>';
    const button = createButton('t3-stt-btn', micIcon, 'Speech to Text');

    const toggleCapture = () => {
      if (state.isRecording) {
        stopRecording();
      } else {
        startRecording();
      }
    };
    button.addEventListener('click', toggleCapture);
    return button;
  };

  /**
   * Builds the gear button that opens the TTS/STT settings dialog.
   * @returns {HTMLButtonElement}
   */
  const createSettingsButton = () => {
    const gearIcon =
      '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12.22 2h-.44a2 2 0 0 0-2 2v.18a2 2 0 0 1-1 1.73l-.43.25a2 2 0 0 1-2 0l-.15-.08a2 2 0 0 0-2.73.73l-.22.38a2 2 0 0 0 .73 2.73l.15.1a2 2 0 0 1 1 1.72v.51a2 2 0 0 1-1 1.74l-.15.09a2 2 0 0 0-.73 2.73l.22.38a2 2 0 0 0 2.73.73l.15-.08a2 2 0 0 1 2 0l.43.25a2 2 0 0 1 1 1.73V20a2 2 0 0 0 2 2h.44a2 2 0 0 0 2-2v-.18a2 2 0 0 1 1-1.73l.43-.25a2 2 0 0 1 2 0l.15.08a2 2 0 0 0 2.73-.73l.22-.39a2 2 0 0 0-.73-2.73l-.15-.08a2 2 0 0 1-1-1.74v-.5a2 2 0 0 1 1-1.74l.15-.09a2 2 0 0 0 .73-2.73l-.22-.38a2 2 0 0 0-2.73-.73l-.15.08a2 2 0 0 1-2 0l-.43-.25a2 2 0 0 1-1-1.73V4a2 2 0 0 0-2-2z"></path><circle cx="12" cy="12" r="3"></circle></svg>';
    const button = createButton('t3-settings-btn', gearIcon, 'TTS/STT Settings');
    button.addEventListener('click', () => {
      showSettingsModal();
    });
    return button;
  };

  /**
   * Builds the per-message "Speak message" button.
   * The text is read from `msg` at click time; the button carries the
   * `speaking` class until the speakText promise settles.
   *
   * @param {Element} msg - Element whose textContent is spoken.
   * @returns {HTMLButtonElement}
   */
  const createMessageSpeakButton = (msg) => {
    const button = document.createElement('button');
    button.className =
      'inline-flex items-center justify-center text-xs rounded-lg p-0 hover:bg-muted/40';
    button.setAttribute('aria-label', 'Speak message');
    button.innerHTML =
      '<div class="relative" style="width:24px;height:24px"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"><polygon points="11 5,6 9,2 9,2 15,6 15,11 19,11 5"></polygon><path d="M15.54 8.46a5 5 0 0 1 0 7.07"></path></svg></div>';

    const onClick = () => {
      const spokenText = msg.textContent.trim();
      if (spokenText) {
        button.classList.add('speaking');
        speakText(spokenText).finally(() => {
          button.classList.remove('speaking');
        });
      }
    };
    button.addEventListener('click', onClick);
    return button;
  };

  /**
   * Synthesizes `txt` and plays it, resolving once playback has finished.
   *
   * `audio.play()` settles as soon as playback *starts*, so the function also
   * waits for the element to end, be paused (e.g. replaced by a newer request)
   * or error out; callers can therefore keep a "speaking" indicator on for the
   * whole utterance. Failures are logged rather than thrown, so the returned
   * promise never rejects.
   *
   * @param {string} txt - Text to speak.
   * @returns {Promise<void>}
   */
  const speakText = async (txt) => {
    try {
      const audio = await textToSpeech(txt);
      // Register before play() so a very short clip cannot finish unnoticed.
      const finished = new Promise((resolve) => {
        ['ended', 'pause', 'error'].forEach((type) => {
          audio.addEventListener(type, resolve, { once: true });
        });
      });
      await audio.play();
      await finished;
    } catch (err) {
      console.error('[T3Chat TTS/STT] Text-to-speech failed:', err);
    }
  };

  /**
   * Syncs the microphone button with state.isRecording: toggles the
   * `recording` class and swaps the tooltip text. No-op when the button is absent.
   */
  const updateSTTButton = () => {
    const micButton = document.querySelector('.t3-stt-btn');
    if (!micButton) return;

    const recording = state.isRecording;
    micButton.classList.toggle('recording', recording);

    const tooltip = micButton.querySelector('.t3-tooltip');
    if (!tooltip) return;
    tooltip.textContent = recording ? 'Stop Recording' : 'Speech to Text';
  };

  /**
   * Opens the TTS/STT settings dialog as a full-screen overlay appended to <body>.
   *
   * The dialog shows whether an API key is available and lets the user pick the
   * STT method, the TTS voice, and toggle each feature. Without a key, the
   * "OpenAI Whisper" option and the voice selector are rendered disabled.
   * Clicking the backdrop or Cancel closes the dialog without saving; Save
   * writes the choices to `state` and localStorage, refreshes the controls and
   * closes the dialog.
   */
  const showSettingsModal = () => {
    // Snapshot of key availability; the markup below is rendered once from it.
    const hasKey = !!state.apiKey;
    const modal = document.createElement('div');
    modal.className = 't3-settings-modal';
    // The dialog's styles travel inside the overlay element, so they are removed with it.
    modal.innerHTML = `
      <style>
        .t3-settings-modal{position:fixed;inset:0;background:rgba(0,0,0,.5);display:flex;align-items:center;justify-content:center;z-index:10000}
        .t3-settings-content{background:hsl(var(--background));border:1px solid hsl(var(--border));border-radius:8px;padding:24px;min-width:400px;max-width:500px}
        .t3-settings-title{font-size:18px;font-weight:600;margin-bottom:16px;color:hsl(var(--foreground))}
        .t3-form-group{margin-bottom:16px}
        .t3-form-label{display:block;font-size:14px;font-weight:500;margin-bottom:4px;color:hsl(var(--foreground))}
        .t3-form-select,.t3-form-input{width:100%;padding:8px 12px;border:1px solid hsl(var(--border));border-radius:6px;background:hsl(var(--background));color:hsl(var(--foreground));font-size:14px}
        .t3-form-checkbox{display:flex;align-items:center;gap:8px}
        .t3-button-group{display:flex;gap:8px;justify-content:flex-end;margin-top:20px}
        .t3-btn{padding:8px 16px;border-radius:6px;border:1px solid hsl(var(--border));background:hsl(var(--background));color:hsl(var(--foreground));cursor:pointer;font-size:14px;transition:all .2s ease}
        .t3-btn:hover{background:hsl(var(--muted))}
        .t3-btn.primary{background:hsl(var(--primary));color:hsl(var(--primary-foreground));border-color:hsl(var(--primary))}
        .t3-btn.primary:hover{opacity:.9}
        .t3-api-key-status{padding:12px;border-radius:6px;background:hsl(var(--muted));border:1px solid hsl(var(--border))}
        .t3-api-status{font-weight:500;margin-top:4px}
        .t3-api-status.connected{color:#22c55e}
        .t3-api-status.disconnected{color:#ef4444}
        .t3-form-help{font-size:12px;color:hsl(var(--muted-foreground));margin-top:8px}
      </style>
      <div class="t3-settings-content">
        <div class="t3-settings-title">TTS & STT Settings</div>
        <div class="t3-form-group">
          <div class="t3-api-key-status">
            <div class="t3-form-label">OpenAI API Key Status</div>
            <div class="t3-api-status ${hasKey ? 'connected' : 'disconnected'}">
              ${hasKey ? '✅ Connected' : '❌ Not configured'}
            </div>
            ${hasKey ? '' : '<p class="t3-form-help">Add your OpenAI key in T3Chat settings.</p>'}
          </div>
        </div>
        <div class="t3-form-group">
          <label class="t3-form-label">STT Method</label>
          <select class="t3-form-select" id="stt-method-select">
            <option value="browser" ${state.sttMethod === 'browser' ? 'selected' : ''}>Browser</option>
            <option value="openai" ${state.sttMethod === 'openai' ? 'selected' : ''} ${!hasKey ? 'disabled' : ''}>OpenAI Whisper</option>
          </select>
        </div>
        <div class="t3-form-group">
          <label class="t3-form-label">TTS Voice</label>
          <select class="t3-form-select" id="voice-select" ${!hasKey ? 'disabled' : ''}>
            ${['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
              .map((v) => `<option value="${v}" ${state.ttsVoice === v ? 'selected' : ''}>${v[0].toUpperCase() + v.slice(1)}</option>`)
              .join('')}
          </select>
        </div>
        <div class="t3-form-group">
          <label class="t3-form-checkbox"><input type="checkbox" id="tts-enabled" ${state.ttsEnabled ? 'checked' : ''}><span>Enable Text-to-Speech</span></label>
        </div>
        <div class="t3-form-group">
          <label class="t3-form-checkbox"><input type="checkbox" id="stt-enabled" ${state.sttEnabled ? 'checked' : ''}><span>Enable Speech-to-Text</span></label>
        </div>
        <div class="t3-button-group">
          <button class="t3-btn" id="cancel-settings">Cancel</button>
          <button class="t3-btn primary" id="save-settings">Save</button>
        </div>
      </div>`;
    // Close only when the backdrop itself is clicked, not the dialog content.
    modal.addEventListener('click', (e) => e.target === modal && modal.remove());
    modal.querySelector('#cancel-settings').addEventListener('click', () => modal.remove());
    modal.querySelector('#save-settings').addEventListener('click', () => {
      // Read the form values from this dialog instance.
      const voice = modal.querySelector('#voice-select').value;
      const ttsEnabled = modal.querySelector('#tts-enabled').checked;
      const sttEnabled = modal.querySelector('#stt-enabled').checked;
      const method = modal.querySelector('#stt-method-select').value;
      // Apply to the in-memory state first ...
      state.ttsVoice = voice;
      state.ttsEnabled = ttsEnabled;
      state.sttEnabled = sttEnabled;
      state.sttMethod = method;
      // ... then persist. localStorage stores strings, so the booleans are saved
      // as 'true' / 'false', which is what the startup code compares against.
      localStorage.setItem(CONFIG.storageKeys.ttsVoice, voice);
      localStorage.setItem(CONFIG.storageKeys.ttsEnabled, ttsEnabled);
      localStorage.setItem(CONFIG.storageKeys.sttEnabled, sttEnabled);
      localStorage.setItem(CONFIG.storageKeys.sttMethod, method);
      updateControlsVisibility();
      modal.remove();
    });
    document.body.appendChild(modal);
  };

  /**
   * Shows or hides the microphone button according to state.sttEnabled and marks
   * it `disabled` when the selected STT method cannot work.
   *
   * Only the OpenAI method needs an API key; browser speech recognition does
   * not, so a missing key must not grey the button out in 'browser' mode.
   */
  const updateControlsVisibility = () => {
    const stt = document.querySelector('.t3-stt-btn');
    if (!stt) return;
    stt.style.display = state.sttEnabled ? 'flex' : 'none';
    const needsApiKey = state.sttMethod !== 'browser';
    stt.classList.toggle('disabled', needsApiKey && !state.apiKey);
  };

  /**
   * Injects the settings and microphone buttons next to the chat input.
   * Idempotent: returns early when the container already holds a settings button.
   *
   * The microphone button is always created and its visibility is left to
   * updateControlsVisibility(); creating it only when STT happens to be enabled
   * at injection time would leave nothing to show when the user enables STT
   * later in the settings dialog.
   */
  const addControlsToInput = () => {
    const container = findInputContainer();
    if (!container || container.querySelector('.t3-settings-btn')) return;
    const sendBtn =
      container.querySelector(SELECTORS.sendButton) ||
      container.querySelector('button[type="submit"]') ||
      container.querySelector('button[aria-label*="send" i]');

    // insertBefore throws unless the reference node is a direct child of the
    // container; with a null reference it appends, which is the fallback.
    const anchor = sendBtn && sendBtn.parentNode === container ? sendBtn : null;
    container.insertBefore(createSettingsButton(), anchor);
    container.insertBefore(createSTTButton(), anchor);

    updateControlsVisibility();
  };

  /**
   * Adds a "Speak message" button to one message's action toolbar.
   * Skips the message when TTS is disabled, when it has no text, when no
   * toolbar can be found, or when the toolbar already has a speak button. On
   * success the message is tagged with data-tts-added.
   *
   * @param {Element} msg - A message container element.
   */
  const processMessage = (msg) => {
    const body = msg.querySelector(SELECTORS.messageContent);
    if (!body) return;
    if (!body.textContent.trim()) return;
    if (!state.ttsEnabled) return;

    // Toolbar lookup: beside the message, inside it, then a looser fallback.
    const host = msg.parentElement;
    const toolbar =
      host?.querySelector(SELECTORS.messageActionsContainer) ||
      msg.querySelector(SELECTORS.messageActionsContainer) ||
      host?.querySelector('div[class*="absolute"][class*="flex"]');
    if (!toolbar) return;
    if (toolbar.querySelector('button[aria-label="Speak message"]')) return;

    const speakButton = createMessageSpeakButton(body);
    const label = toolbar.querySelector('span[class*="select-none"]');
    if (label) {
      toolbar.insertBefore(speakButton, label);
    } else {
      // Otherwise place it right after the toolbar's first button, or at the end.
      const afterFirst = toolbar.querySelector('button')?.nextSibling;
      if (afterFirst) {
        toolbar.insertBefore(speakButton, afterFirst);
      } else {
        toolbar.appendChild(speakButton);
      }
    }
    msg.setAttribute('data-tts-added', 'true');
  };

  /**
   * Runs processMessage on every message container not yet tagged with
   * data-tts-added.
   *
   * SELECTORS.messageContainer is a comma-separated selector list, so it is
   * wrapped in :is(); appending :not(...) directly would only constrain the last
   * selector in the list and re-select already processed messages on every pass.
   */
  const addTTSToMessages = () => {
    const pending = document.querySelectorAll(
      `:is(${SELECTORS.messageContainer}):not([data-tts-added])`
    );
    for (const message of pending) {
      processMessage(message);
    }
  };

  /**
   * Boots the script: injects styles, adds the controls and speak buttons, and
   * keeps them in place as the page re-renders.
   *
   * The observer callback runs for every batch of DOM mutations (including the
   * ones caused by this script's own insertions), so the DOM scans are
   * coalesced to at most one per animation frame instead of running on each
   * callback.
   */
  const initialize = () => {
    const refresh = () => {
      addControlsToInput();
      addTTSToMessages();
    };

    injectStyles();
    refresh();

    let refreshQueued = false;
    new MutationObserver(() => {
      if (refreshQueued) return;
      refreshQueued = true;
      requestAnimationFrame(() => {
        // Clear the flag first so a throwing refresh cannot block later ones.
        refreshQueued = false;
        refresh();
      });
    }).observe(document.documentElement, { childList: true, subtree: true });

    // One delayed pass for messages whose toolbars were not present on the first pass.
    setTimeout(addTTSToMessages, 2000);
  };

  // Start immediately when the document has already been parsed; otherwise wait
  // for DOMContentLoaded.
  if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', initialize);
  } else {
    initialize();
  }
})();