// File
// tennis-training-hub/server/_core/voiceTranscription.ts
// 2026-03-14 07:18:53 -04:00
//
// 285 lines
// 7.9 KiB
// TypeScript

/**
* Voice transcription helper using internal Speech-to-Text service
*
* Frontend implementation guide:
* 1. Capture audio using MediaRecorder API
* 2. Upload audio to storage (e.g., S3) to get URL
* 3. Call transcription with the URL
*
* Example usage:
* ```tsx
* // Frontend component
* const transcribeMutation = trpc.voice.transcribe.useMutation({
*   onSuccess: (data) => {
*     console.log(data.text); // Full transcription
*     console.log(data.language); // Detected language
*     console.log(data.segments); // Timestamped segments
*   }
* });
*
* // After uploading audio to storage
* transcribeMutation.mutate({
*   audioUrl: uploadedAudioUrl,
*   language: 'en', // optional
*   prompt: 'Transcribe the meeting' // optional
* });
* ```
*/
import { ENV } from "./env";
/**
* Input accepted by `transcribeAudio`.
*
* - `audioUrl` is downloaded server-side with `fetch` before the audio is forwarded
*   to the transcription service.
* - `language` is only used to word the default prompt (via `getLanguageName`); it is
*   not sent as a separate form field and has no effect when `prompt` is provided.
* - `prompt`, when non-empty, replaces the generated default prompt.
*/
export type TranscribeOptions = {
audioUrl: string; // URL to the audio file (e.g., S3 URL)
language?: string; // Optional: specify language code (e.g., "en", "es", "zh")
prompt?: string; // Optional: custom prompt for the transcription
};
/**
* One entry of `WhisperResponse.segments`, in the native Whisper API segment format.
*
* NOTE(review): nothing in this module reads these fields, so the per-field notes below
* are assumptions based on the field names — confirm against the transcription
* service's documentation.
*/
export type WhisperSegment = {
id: number;
seek: number;
start: number; // presumably the segment's start offset in seconds — TODO confirm
end: number; // presumably the segment's end offset in seconds — TODO confirm
text: string; // transcribed text of this segment
tokens: number[];
temperature: number;
avg_logprob: number;
compression_ratio: number;
no_speech_prob: number;
};
/**
* Native Whisper API response format, as parsed from the service's JSON reply
* (the request asks for `response_format: "verbose_json"`).
*/
export type WhisperResponse = {
task: "transcribe";
language: string; // detected language
duration: number; // presumably the audio duration in seconds — TODO confirm
text: string; // full transcription
segments: WhisperSegment[]; // timestamped segments
};
/** Successful result of `transcribeAudio`: the parsed service response, returned without remapping. */
export type TranscriptionResponse = WhisperResponse; // Return native Whisper API response directly
/**
* Failure result of `transcribeAudio`. Callers can tell it apart from a successful
* response by the presence of the `error` property.
*
* How `transcribeAudio` uses each `code`:
* - `FILE_TOO_LARGE`: the downloaded audio is larger than 16MB.
* - `INVALID_FORMAT`: the audio download answered with a non-OK HTTP status.
* - `TRANSCRIPTION_FAILED`: the transcription service answered with a non-OK HTTP status.
* - `SERVICE_ERROR`: missing configuration, a failed audio fetch, an invalid service
*   response, or any unexpected exception.
* - `UPLOAD_FAILED`: not produced by `transcribeAudio`.
*/
export type TranscriptionError = {
error: string; // short human-readable summary
code: "FILE_TOO_LARGE" | "INVALID_FORMAT" | "TRANSCRIPTION_FAILED" | "UPLOAD_FAILED" | "SERVICE_ERROR";
details?: string; // optional diagnostic text (HTTP status, exception message, ...)
};
/**
 * Transcribe audio to text using the internal Speech-to-Text service.
 *
 * Downloads the audio at `options.audioUrl`, rejects it when it is larger than
 * 16MB, and forwards it as multipart form data to `v1/audio/transcriptions`
 * under `ENV.forgeApiUrl`. Every failure is reported through the return value;
 * the function does not throw.
 *
 * NOTE(review): `options.audioUrl` is fetched server-side exactly as given. If
 * it can originate from untrusted clients, restrict it to the expected storage
 * origin(s) before calling this function to avoid server-side request forgery.
 *
 * @param options - Audio URL plus an optional language hint and custom prompt
 * @returns The service's verbose JSON response, or a `TranscriptionError`
 */
export async function transcribeAudio(
  options: TranscribeOptions
): Promise<TranscriptionResponse | TranscriptionError> {
  const MAX_AUDIO_SIZE_MB = 16;
  const BYTES_PER_MB = 1024 * 1024;

  // Builds the FILE_TOO_LARGE error shared by the pre- and post-download checks.
  const fileTooLarge = (sizeMB: number): TranscriptionError => ({
    error: "Audio file exceeds maximum size limit",
    code: "FILE_TOO_LARGE",
    details: `File size is ${sizeMB.toFixed(2)}MB, maximum allowed is ${MAX_AUDIO_SIZE_MB}MB`,
  });

  try {
    // Step 1: Validate environment configuration
    if (!ENV.forgeApiUrl) {
      return {
        error: "Voice transcription service is not configured",
        code: "SERVICE_ERROR",
        details: "BUILT_IN_FORGE_API_URL is not set",
      };
    }
    if (!ENV.forgeApiKey) {
      return {
        error: "Voice transcription service authentication is missing",
        code: "SERVICE_ERROR",
        details: "BUILT_IN_FORGE_API_KEY is not set",
      };
    }

    // Step 2: Download audio from URL
    let audioBuffer: Buffer;
    let mimeType: string;
    try {
      const downloadResponse = await fetch(options.audioUrl);
      if (!downloadResponse.ok) {
        // The INVALID_FORMAT code is kept for backward compatibility with existing callers.
        return {
          error: "Failed to download audio file",
          code: "INVALID_FORMAT",
          details: `HTTP ${downloadResponse.status}: ${downloadResponse.statusText}`,
        };
      }

      // Reject oversized files from the declared length so they are never
      // buffered into memory. A missing or malformed header yields 0 or NaN
      // and falls through to the post-download check below.
      const declaredBytes = Number(downloadResponse.headers.get("content-length"));
      if (
        Number.isFinite(declaredBytes) &&
        declaredBytes / BYTES_PER_MB > MAX_AUDIO_SIZE_MB
      ) {
        // Release the connection without reading the body.
        await downloadResponse.body?.cancel().catch(() => undefined);
        return fileTooLarge(declaredBytes / BYTES_PER_MB);
      }

      audioBuffer = Buffer.from(await downloadResponse.arrayBuffer());
      mimeType = downloadResponse.headers.get("content-type") || "audio/mpeg";

      // The declared length can be absent or wrong, so check the real size too.
      const sizeMB = audioBuffer.length / BYTES_PER_MB;
      if (sizeMB > MAX_AUDIO_SIZE_MB) {
        return fileTooLarge(sizeMB);
      }
    } catch (error) {
      return {
        error: "Failed to fetch audio file",
        code: "SERVICE_ERROR",
        details: error instanceof Error ? error.message : "Unknown error",
      };
    }

    // Step 3: Create FormData for multipart upload to Whisper API
    const formData = new FormData();
    const filename = `audio.${getFileExtension(mimeType)}`;
    const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType });
    formData.append("file", audioBlob, filename);
    formData.append("model", "whisper-1");
    formData.append("response_format", "verbose_json");

    // Use the custom prompt if provided, otherwise generate one from the language.
    // `||` is intentional: an empty custom prompt falls back to the default.
    const prompt =
      options.prompt ||
      (options.language
        ? `Transcribe the user's voice to text, the user's working language is ${getLanguageName(options.language)}`
        : "Transcribe the user's voice to text");
    formData.append("prompt", prompt);

    // Step 4: Call the transcription service
    // A trailing slash on the base keeps URL resolution from dropping its last path segment.
    const baseUrl = ENV.forgeApiUrl.endsWith("/")
      ? ENV.forgeApiUrl
      : `${ENV.forgeApiUrl}/`;
    const fullUrl = new URL("v1/audio/transcriptions", baseUrl).toString();
    const serviceResponse = await fetch(fullUrl, {
      method: "POST",
      headers: {
        authorization: `Bearer ${ENV.forgeApiKey}`,
        "Accept-Encoding": "identity",
      },
      body: formData,
    });
    if (!serviceResponse.ok) {
      const errorText = await serviceResponse.text().catch(() => "");
      return {
        error: "Transcription service request failed",
        code: "TRANSCRIPTION_FAILED",
        details: `${serviceResponse.status} ${serviceResponse.statusText}${errorText ? `: ${errorText}` : ""}`,
      };
    }

    // Step 5: Parse and return the transcription result
    const payload: unknown = await serviceResponse.json();

    // Validate structure only: an empty `text` is a legitimate result for audio
    // that contains no speech and must not be reported as a malformed response.
    if (
      typeof payload !== "object" ||
      payload === null ||
      typeof (payload as { text?: unknown }).text !== "string"
    ) {
      return {
        error: "Invalid transcription response",
        code: "SERVICE_ERROR",
        details: "Transcription service returned an invalid response format",
      };
    }
    return payload as WhisperResponse; // Return native Whisper API response directly
  } catch (error) {
    // Handle unexpected errors (network failure on the service call, invalid JSON, ...)
    return {
      error: "Voice transcription failed",
      code: "SERVICE_ERROR",
      details: error instanceof Error ? error.message : "An unexpected error occurred",
    };
  }
}
/**
 * File extensions keyed by bare, lowercase audio MIME type (no parameters).
 */
const AUDIO_MIME_EXTENSIONS: ReadonlyMap<string, string> = new Map([
  ["audio/webm", "webm"],
  ["audio/mp3", "mp3"],
  ["audio/mpeg", "mp3"],
  ["audio/wav", "wav"],
  ["audio/wave", "wav"],
  ["audio/x-wav", "wav"],
  ["audio/ogg", "ogg"],
  ["audio/m4a", "m4a"],
  ["audio/x-m4a", "m4a"],
  ["audio/mp4", "m4a"],
  ["audio/flac", "flac"],
]);

/**
 * Helper function to get file extension from MIME type.
 *
 * Accepts a full Content-Type header value: parameters (for example
 * `audio/webm;codecs=opus`), surrounding whitespace and letter case are
 * ignored before the lookup.
 *
 * @param mimeType - MIME type or Content-Type header value of the audio
 * @returns The matching extension without a dot, or `"audio"` when unknown
 */
function getFileExtension(mimeType: string): string {
  // Keep only the "type/subtype" part that precedes any ";parameter".
  const [essence = ""] = mimeType.split(";");
  return AUDIO_MIME_EXTENSIONS.get(essence.trim().toLowerCase()) ?? "audio";
}
/**
 * English language names keyed by lowercase ISO 639-1 code.
 */
const LANGUAGE_NAMES: ReadonlyMap<string, string> = new Map([
  ["en", "English"],
  ["es", "Spanish"],
  ["fr", "French"],
  ["de", "German"],
  ["it", "Italian"],
  ["pt", "Portuguese"],
  ["ru", "Russian"],
  ["ja", "Japanese"],
  ["ko", "Korean"],
  ["zh", "Chinese"],
  ["ar", "Arabic"],
  ["hi", "Hindi"],
  ["nl", "Dutch"],
  ["pl", "Polish"],
  ["tr", "Turkish"],
  ["sv", "Swedish"],
  ["da", "Danish"],
  ["no", "Norwegian"],
  ["fi", "Finnish"],
]);

/**
 * Helper function to get full language name from ISO code.
 *
 * The lookup uses the primary subtag only and ignores letter case, so
 * `"en-US"`, `"zh_CN"` and `"EN"` resolve the same way as `"en"` and `"zh"`.
 *
 * @param langCode - Language code such as `"en"` or `"pt-BR"`
 * @returns The English language name, or `langCode` unchanged when unknown
 */
function getLanguageName(langCode: string): string {
  // Region/script subtags follow a "-" (BCP 47) or "_" (POSIX locale style).
  const [primarySubtag = ""] = langCode.trim().toLowerCase().split(/[-_]/);
  return LANGUAGE_NAMES.get(primarySubtag) ?? langCode;
}
/**
* Example tRPC procedure implementation:
*
* ```ts
* // In server/routers.ts
* import { transcribeAudio } from "./_core/voiceTranscription";
*
* export const voiceRouter = router({
*   transcribe: protectedProcedure
*     .input(z.object({
*       audioUrl: z.string(),
*       language: z.string().optional(),
*       prompt: z.string().optional(),
*     }))
*     .mutation(async ({ input, ctx }) => {
*       const result = await transcribeAudio(input);
*
*       // Check if it's an error
*       if ('error' in result) {
*         throw new TRPCError({
*           code: 'BAD_REQUEST',
*           message: result.error,
*           cause: result,
*         });
*       }
*
*       // Optionally save transcription to database
*       await db.insert(transcriptions).values({
*         userId: ctx.user.id,
*         text: result.text,
*         duration: result.duration,
*         language: result.language,
*         audioUrl: input.audioUrl,
*         createdAt: new Date(),
*       });
*
*       return result;
*     }),
* });
* ```
*/