// ==UserScript==
// @name T3Chat OpenAI TTS & STT
// @namespace https://github.com/cameron/t3chat-userscripts
// @version 0.1.2
// @description Adds OpenAI text-to-speech and speech-to-text to T3Chat
// @match https://t3.chat/*
// @match https://*.t3.chat/*
// @run-at document-idle
// @grant none
// @license MIT
// ==/UserScript==
(() => {
'use strict';
const CONFIG = {
apiBaseUrl: 'https://api.openai.com/v1',
ttsModel: 'tts-1',
ttsVoice: 'alloy',
sttModel: 'whisper-1',
maxRecordingTime: 60000,
currentVersion: '0.1.2',
storageKeys: {
t3chatApiKey: 'apikey:openai',
ttsEnabled: 't3chat-tts-enabled',
sttEnabled: 't3chat-stt-enabled',
ttsVoice: 't3chat-tts-voice',
sttMethod: 't3chat-stt-method',
version: 't3chat-tts-stt-version'
}
};
if (localStorage.getItem(CONFIG.storageKeys.version) !== CONFIG.currentVersion) {
localStorage.removeItem(CONFIG.storageKeys.sttMethod);
localStorage.setItem(CONFIG.storageKeys.version, CONFIG.currentVersion);
}
const SELECTORS = {
chatInput: [
'#chat-input',
'textarea[aria-describedby="chat-input-description"]',
'textarea[placeholder*="message"]',
'textarea[data-testid="chat-input"]'
],
messageContainer: '[role="article"], .message, div[class*="message"]',
messageContent: '.prose, .message-content, div[class*="prose"], p, div[class*="text"]',
messageActionsContainer:
'div[class*="absolute"][class*="flex"][class*="items-center"][class*="gap"], div.absolute.left-0[class*="-ml-0"][class*="mt-2"], div.absolute.right-0[class*="mt-"]',
sendButton: 'button[type="submit"][aria-label*="Message"], button[aria-label*="send" i]'
};
const getT3ChatApiKey = () => {
const key = localStorage.getItem(CONFIG.storageKeys.t3chatApiKey);
return key?.startsWith('sk-') ? key : null;
};
const state = {
get apiKey() {
return getT3ChatApiKey();
},
ttsEnabled: localStorage.getItem(CONFIG.storageKeys.ttsEnabled) !== 'false',
sttEnabled: localStorage.getItem(CONFIG.storageKeys.sttEnabled) !== 'false',
sttMethod: localStorage.getItem(CONFIG.storageKeys.sttMethod) || 'openai',
ttsVoice: localStorage.getItem(CONFIG.storageKeys.ttsVoice) || CONFIG.ttsVoice,
isRecording: false,
mediaRecorder: null,
audioChunks: [],
currentAudio: null,
recordingMimeType: '',
speechRecognition: null
};
if (localStorage.getItem(CONFIG.storageKeys.ttsEnabled) === null) {
localStorage.setItem(CONFIG.storageKeys.ttsEnabled, 'true');
state.ttsEnabled = true;
}
if (localStorage.getItem(CONFIG.storageKeys.sttEnabled) === null) {
localStorage.setItem(CONFIG.storageKeys.sttEnabled, 'true');
state.sttEnabled = true;
}
const findChatInput = () =>
SELECTORS.chatInput
.map((s) => document.querySelector(s))
.find((el) => el && el.tagName === 'TEXTAREA');
const findInputContainer = () => {
const input = findChatInput();
if (!input) return null;
const sendBtn =
document.querySelector(SELECTORS.sendButton) ||
input.parentElement?.querySelector('button[type="submit"]') ||
input.parentElement?.querySelector('button[aria-label*="send" i]');
return sendBtn ? sendBtn.parentElement : input.closest('div[class*="flex"]') || input.parentElement;
};
const injectStyles = () => {
if (document.querySelector('#t3chat-tts-stt-styles')) return;
const style = document.createElement('style');
style.id = 't3chat-tts-stt-styles';
style.textContent = `
.t3-tts-btn,.t3-stt-btn,.t3-settings-btn{
display:flex;align-items:center;justify-content:center;width:32px;height:32px;border:1px solid hsl(var(--border));
border-radius:6px;background:hsl(var(--background));color:hsl(var(--foreground));cursor:pointer;
transition:all .2s ease;position:relative;flex-shrink:0
}
.t3-tts-btn:hover,.t3-stt-btn:hover,.t3-settings-btn:hover{background:hsl(var(--muted));border-color:hsl(var(--ring))}
.t3-stt-btn.recording{background:#ef4444;color:#fff;animation:pulse 1s infinite}
.t3-tts-btn.speaking{background:#3b82f6;color:#fff}
.t3-tts-btn.disabled,.t3-stt-btn.disabled{opacity:.5;cursor:not-allowed}
@keyframes pulse{0%,100%{opacity:1}50%{opacity:.7}}
.t3-tooltip{position:absolute;bottom:100%;left:50%;transform:translateX(-50%);background:hsl(var(--foreground));
color:hsl(var(--background));padding:4px 8px;border-radius:4px;font-size:12px;white-space:nowrap;opacity:0;
pointer-events:none;transition:opacity .2s ease;margin-bottom:4px;z-index:1000}
.t3-stt-btn:hover .t3-tooltip,.t3-settings-btn:hover .t3-tooltip{opacity:1}
button[aria-label="Speak message"].speaking{background:#3b82f6!important;color:#fff!important}
button[aria-label="Speak message"]{width:32px!important;height:32px!important;min-width:32px!important;min-height:32px!important;
display:flex!important;align-items:center!important;justify-content:center!important}
button[aria-label="Speak message"] .relative,button[aria-label="Speak message"] svg{width:24px!important;height:24px!important}
`;
document.head.appendChild(style);
};
const callOpenAI = async (endpoint, data, options = {}) => {
if (!state.apiKey) throw new Error('OpenAI API key not configured');
const res = await fetch(`${CONFIG.apiBaseUrl}${endpoint}`, {
method: 'POST',
headers: {
Authorization: `Bearer ${state.apiKey}`,
'Content-Type': 'application/json',
...options.headers
},
body: JSON.stringify(data),
...options
});
if (!res.ok) {
const err = await res.json().catch(() => ({ error: { message: `HTTP ${res.status}` } }));
throw new Error(err.error?.message || `HTTP ${res.status}`);
}
return res;
};
const textToSpeech = async (text) => {
const res = await callOpenAI('/audio/speech', {
model: CONFIG.ttsModel,
voice: state.ttsVoice,
input: text.slice(0, 4096)
});
const blob = await res.blob();
const url = URL.createObjectURL(blob);
if (state.currentAudio) {
state.currentAudio.pause();
URL.revokeObjectURL(state.currentAudio.src);
}
state.currentAudio = new Audio(url);
return state.currentAudio;
};
const speechToText = async (blob) => {
const mime = blob.type.toLowerCase();
const ext =
mime.includes('wav')
? 'wav'
: mime.includes('mp4')
? 'mp4'
: mime.includes('mp3')
? 'mp3'
: mime.includes('ogg')
? 'ogg'
: 'webm';
const form = new FormData();
form.append('file', blob, `audio.${ext}`);
form.append('model', CONFIG.sttModel);
const res = await fetch(`${CONFIG.apiBaseUrl}/audio/transcriptions`, {
method: 'POST',
headers: { Authorization: `Bearer ${state.apiKey}` },
body: form
});
if (!res.ok) {
const txt = await res.text();
throw new Error(`STT failed: ${txt}`);
}
const json = await res.json();
return json.text;
};
const initSpeechRecognition = () => {
const SR = window.SpeechRecognition || window.webkitSpeechRecognition;
if (!SR) return null;
const rec = new SR();
rec.continuous = false;
rec.interimResults = false;
rec.maxAlternatives = 1;
rec.lang = 'en-US';
rec.onstart = () => {
state.isRecording = true;
updateSTTButton();
};
rec.onresult = (e) => {
const txt = e.results[0][0].transcript;
const input = findChatInput();
if (input && txt.trim()) {
input.value = (input.value + ' ' + txt).trim();
input.dispatchEvent(new Event('input', { bubbles: true }));
input.focus();
}
};
rec.onerror = rec.onend = () => {
state.isRecording = false;
updateSTTButton();
};
return rec;
};
const startRecording = async () => {
if (state.sttMethod === 'browser') return startBrowserSpeechRecognition();
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const types = [
'audio/wav',
'audio/mp4',
'audio/webm;codecs=opus',
'audio/webm',
'audio/ogg;codecs=opus',
'audio/mp3'
];
const type = types.find((t) => MediaRecorder.isTypeSupported(t)) || '';
if (!type) throw new Error('No supported audio MIME type found');
state.mediaRecorder = new MediaRecorder(stream, { mimeType: type });
state.audioChunks = [];
state.recordingMimeType = type;
state.mediaRecorder.ondataavailable = (e) => e.data.size && state.audioChunks.push(e.data);
state.mediaRecorder.onstop = async () => {
const blob = new Blob(state.audioChunks, { type: state.recordingMimeType });
try {
const txt = await speechToText(blob);
const input = findChatInput();
if (input && txt.trim()) {
input.value = (input.value + ' ' + txt).trim();
input.dispatchEvent(new Event('input', { bubbles: true }));
input.focus();
}
} finally {
stream.getTracks().forEach((t) => t.stop());
state.isRecording = false;
updateSTTButton();
}
};
state.mediaRecorder.start();
state.isRecording = true;
updateSTTButton();
setTimeout(() => state.isRecording && stopRecording(), CONFIG.maxRecordingTime);
} catch (err) {}
};
const startBrowserSpeechRecognition = () => {
if (!state.speechRecognition) state.speechRecognition = initSpeechRecognition();
state.speechRecognition?.start();
};
const stopRecording = () => {
if (state.sttMethod === 'browser') {
state.speechRecognition?.stop();
} else {
state.mediaRecorder?.stop();
}
};
const createButton = (cls, svg, tooltip) => {
const btn = document.createElement('button');
btn.className = cls;
btn.innerHTML = `${svg}<div class="t3-tooltip">${tooltip}</div>`;
return btn;
};
const createTTSButton = () => {
const svg =
'<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><polygon points="11 5,6 9,2 9,2 15,6 15,11 19,11 5"></polygon><path d="M15.54 8.46a5 5 0 0 1 0 7.07"></path><path d="M19.07 4.93a10 10 0 0 1 0 14.14"></path></svg>';
const btn = createButton('t3-tts-btn', svg, 'Text to Speech');
btn.addEventListener('click', async () => {
const input = findChatInput();
if (input?.value.trim()) await speakText(input.value.trim());
});
return btn;
};
const createSTTButton = () => {
const svg =
'<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 2a3 3 0 0 0-3 3v7a3 3 0 0 0 6 0V5a3 3 0 0 0-3-3Z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" x2="12" y1="19" y2="22"></line><line x1="8" x2="16" y1="22" y2="22"></line></svg>';
const btn = createButton('t3-stt-btn', svg, 'Speech to Text');
btn.addEventListener('click', () => (state.isRecording ? stopRecording() : startRecording()));
return btn;
};
const createSettingsButton = () => {
const svg =
'<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12.22 2h-.44a2 2 0 0 0-2 2v.18a2 2 0 0 1-1 1.73l-.43.25a2 2 0 0 1-2 0l-.15-.08a2 2 0 0 0-2.73.73l-.22.38a2 2 0 0 0 .73 2.73l.15.1a2 2 0 0 1 1 1.72v.51a2 2 0 0 1-1 1.74l-.15.09a2 2 0 0 0-.73 2.73l.22.38a2 2 0 0 0 2.73.73l.15-.08a2 2 0 0 1 2 0l.43.25a2 2 0 0 1 1 1.73V20a2 2 0 0 0 2 2h.44a2 2 0 0 0 2-2v-.18a2 2 0 0 1 1-1.73l.43-.25a2 2 0 0 1 2 0l.15.08a2 2 0 0 0 2.73-.73l.22-.39a2 2 0 0 0-.73-2.73l-.15-.08a2 2 0 0 1-1-1.74v-.5a2 2 0 0 1 1-1.74l.15-.09a2 2 0 0 0 .73-2.73l-.22-.38a2 2 0 0 0-2.73-.73l-.15.08a2 2 0 0 1-2 0l-.43-.25a2 2 0 0 1-1-1.73V4a2 2 0 0 0-2-2z"></path><circle cx="12" cy="12" r="3"></circle></svg>';
const btn = createButton('t3-settings-btn', svg, 'TTS/STT Settings');
btn.addEventListener('click', showSettingsModal);
return btn;
};
const createMessageSpeakButton = (msg) => {
const btn = document.createElement('button');
btn.className =
'inline-flex items-center justify-center text-xs rounded-lg p-0 hover:bg-muted/40';
btn.setAttribute('aria-label', 'Speak message');
btn.innerHTML =
'<div class="relative" style="width:24px;height:24px"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"><polygon points="11 5,6 9,2 9,2 15,6 15,11 19,11 5"></polygon><path d="M15.54 8.46a5 5 0 0 1 0 7.07"></path></svg></div>';
btn.addEventListener('click', () => {
const text = msg.textContent.trim();
if (!text) return;
btn.classList.add('speaking');
speakText(text).finally(() => btn.classList.remove('speaking'));
});
return btn;
};
const speakText = async (txt) => {
try {
const audio = await textToSpeech(txt);
await audio.play();
} catch (err) {}
};
const updateSTTButton = () => {
const btn = document.querySelector('.t3-stt-btn');
if (!btn) return;
btn.classList.toggle('recording', state.isRecording);
const tip = btn.querySelector('.t3-tooltip');
if (tip) tip.textContent = state.isRecording ? 'Stop Recording' : 'Speech to Text';
};
const showSettingsModal = () => {
const hasKey = !!state.apiKey;
const modal = document.createElement('div');
modal.className = 't3-settings-modal';
modal.innerHTML = `
<style>
.t3-settings-modal{position:fixed;inset:0;background:rgba(0,0,0,.5);display:flex;align-items:center;justify-content:center;z-index:10000}
.t3-settings-content{background:hsl(var(--background));border:1px solid hsl(var(--border));border-radius:8px;padding:24px;min-width:400px;max-width:500px}
.t3-settings-title{font-size:18px;font-weight:600;margin-bottom:16px;color:hsl(var(--foreground))}
.t3-form-group{margin-bottom:16px}
.t3-form-label{display:block;font-size:14px;font-weight:500;margin-bottom:4px;color:hsl(var(--foreground))}
.t3-form-select,.t3-form-input{width:100%;padding:8px 12px;border:1px solid hsl(var(--border));border-radius:6px;background:hsl(var(--background));color:hsl(var(--foreground));font-size:14px}
.t3-form-checkbox{display:flex;align-items:center;gap:8px}
.t3-button-group{display:flex;gap:8px;justify-content:flex-end;margin-top:20px}
.t3-btn{padding:8px 16px;border-radius:6px;border:1px solid hsl(var(--border));background:hsl(var(--background));color:hsl(var(--foreground));cursor:pointer;font-size:14px;transition:all .2s ease}
.t3-btn:hover{background:hsl(var(--muted))}
.t3-btn.primary{background:hsl(var(--primary));color:hsl(var(--primary-foreground));border-color:hsl(var(--primary))}
.t3-btn.primary:hover{opacity:.9}
.t3-api-key-status{padding:12px;border-radius:6px;background:hsl(var(--muted));border:1px solid hsl(var(--border))}
.t3-api-status{font-weight:500;margin-top:4px}
.t3-api-status.connected{color:#22c55e}
.t3-api-status.disconnected{color:#ef4444}
.t3-form-help{font-size:12px;color:hsl(var(--muted-foreground));margin-top:8px}
</style>
<div class="t3-settings-content">
<div class="t3-settings-title">TTS & STT Settings</div>
<div class="t3-form-group">
<div class="t3-api-key-status">
<div class="t3-form-label">OpenAI API Key Status</div>
<div class="t3-api-status ${hasKey ? 'connected' : 'disconnected'}">
${hasKey ? '✅ Connected' : '❌ Not configured'}
</div>
${hasKey ? '' : '<p class="t3-form-help">Add your OpenAI key in T3Chat settings.</p>'}
</div>
</div>
<div class="t3-form-group">
<label class="t3-form-label">STT Method</label>
<select class="t3-form-select" id="stt-method-select">
<option value="browser" ${state.sttMethod === 'browser' ? 'selected' : ''}>Browser</option>
<option value="openai" ${state.sttMethod === 'openai' ? 'selected' : ''} ${!hasKey ? 'disabled' : ''}>OpenAI Whisper</option>
</select>
</div>
<div class="t3-form-group">
<label class="t3-form-label">TTS Voice</label>
<select class="t3-form-select" id="voice-select" ${!hasKey ? 'disabled' : ''}>
${['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
.map((v) => `<option value="${v}" ${state.ttsVoice === v ? 'selected' : ''}>${v[0].toUpperCase() + v.slice(1)}</option>`)
.join('')}
</select>
</div>
<div class="t3-form-group">
<label class="t3-form-checkbox"><input type="checkbox" id="tts-enabled" ${state.ttsEnabled ? 'checked' : ''}><span>Enable Text-to-Speech</span></label>
</div>
<div class="t3-form-group">
<label class="t3-form-checkbox"><input type="checkbox" id="stt-enabled" ${state.sttEnabled ? 'checked' : ''}><span>Enable Speech-to-Text</span></label>
</div>
<div class="t3-button-group">
<button class="t3-btn" id="cancel-settings">Cancel</button>
<button class="t3-btn primary" id="save-settings">Save</button>
</div>
</div>`;
modal.addEventListener('click', (e) => e.target === modal && modal.remove());
modal.querySelector('#cancel-settings').addEventListener('click', () => modal.remove());
modal.querySelector('#save-settings').addEventListener('click', () => {
const voice = modal.querySelector('#voice-select').value;
const ttsEnabled = modal.querySelector('#tts-enabled').checked;
const sttEnabled = modal.querySelector('#stt-enabled').checked;
const method = modal.querySelector('#stt-method-select').value;
state.ttsVoice = voice;
state.ttsEnabled = ttsEnabled;
state.sttEnabled = sttEnabled;
state.sttMethod = method;
localStorage.setItem(CONFIG.storageKeys.ttsVoice, voice);
localStorage.setItem(CONFIG.storageKeys.ttsEnabled, ttsEnabled);
localStorage.setItem(CONFIG.storageKeys.sttEnabled, sttEnabled);
localStorage.setItem(CONFIG.storageKeys.sttMethod, method);
updateControlsVisibility();
modal.remove();
});
document.body.appendChild(modal);
};
const updateControlsVisibility = () => {
const stt = document.querySelector('.t3-stt-btn');
if (!stt) return;
stt.style.display = state.sttEnabled ? 'flex' : 'none';
stt.classList.toggle('disabled', !state.apiKey);
};
const addControlsToInput = () => {
const container = findInputContainer();
if (!container || container.querySelector('.t3-settings-btn')) return;
const sendBtn =
container.querySelector(SELECTORS.sendButton) ||
container.querySelector('button[type="submit"]') ||
container.querySelector('button[aria-label*="send" i]');
const settingsBtn = createSettingsButton();
if (sendBtn) container.insertBefore(settingsBtn, sendBtn);
else container.appendChild(settingsBtn);
if (state.sttEnabled) {
const sttBtn = createSTTButton();
sendBtn ? container.insertBefore(sttBtn, sendBtn) : container.appendChild(sttBtn);
}
updateControlsVisibility();
};
const processMessage = (msg) => {
const content = msg.querySelector(SELECTORS.messageContent);
if (!content || !content.textContent.trim() || !state.ttsEnabled) return;
let actions =
msg.parentElement?.querySelector(SELECTORS.messageActionsContainer) ||
msg.querySelector(SELECTORS.messageActionsContainer);
if (!actions) actions = msg.parentElement?.querySelector('div[class*="absolute"][class*="flex"]');
if (!actions || actions.querySelector('button[aria-label="Speak message"]')) return;
const speakBtn = createMessageSpeakButton(content);
const genTxt = actions.querySelector('span[class*="select-none"]');
if (genTxt) actions.insertBefore(speakBtn, genTxt);
else {
const first = actions.querySelector('button');
first?.nextSibling ? actions.insertBefore(speakBtn, first.nextSibling) : actions.appendChild(speakBtn);
}
msg.setAttribute('data-tts-added', 'true');
};
const addTTSToMessages = () => {
document
.querySelectorAll(`${SELECTORS.messageContainer}:not([data-tts-added])`)
.forEach(processMessage);
};
const initialize = () => {
injectStyles();
addControlsToInput();
addTTSToMessages();
new MutationObserver(() => {
addControlsToInput();
addTTSToMessages();
}).observe(document.documentElement, { childList: true, subtree: true });
setTimeout(addTTSToMessages, 2000);
};
document.readyState === 'loading'
? document.addEventListener('DOMContentLoaded', initialize)
: initialize();
})();