285 行
7.9 KiB
TypeScript
285 行
7.9 KiB
TypeScript
/**
 * Voice transcription helper using internal Speech-to-Text service
 *
 * Frontend implementation guide:
 * 1. Capture audio using MediaRecorder API
 * 2. Upload audio to storage (e.g., S3) to get URL
 * 3. Call transcription with the URL
 *
 * Example usage:
 * ```tsx
 * // Frontend component
 * const transcribeMutation = trpc.voice.transcribe.useMutation({
 *   onSuccess: (data) => {
 *     console.log(data.text); // Full transcription
 *     console.log(data.language); // Detected language
 *     console.log(data.segments); // Timestamped segments
 *   }
 * });
 *
 * // After uploading audio to storage
 * transcribeMutation.mutate({
 *   audioUrl: uploadedAudioUrl,
 *   language: 'en', // optional
 *   prompt: 'Transcribe the meeting' // optional
 * });
 * ```
 */
|
|
import { ENV } from "./env";
|
|
|
|
/**
 * Input accepted by `transcribeAudio`.
 */
export type TranscribeOptions = {
  /** URL to the audio file (e.g., S3 URL); it is downloaded server-side. */
  audioUrl: string;
  /**
   * Optional language code (e.g., "en", "es", "zh").
   * Only used to build the default prompt when `prompt` is not provided.
   */
  language?: string;
  /** Optional custom prompt for the transcription; replaces the default prompt. */
  prompt?: string;
};
|
|
|
|
/**
 * Native Whisper API segment format.
 *
 * Field names mirror the service's `verbose_json` payload and are passed
 * through untouched.
 */
export type WhisperSegment = {
  id: number;
  seek: number;
  // NOTE(review): start/end are presumably offsets in seconds — confirm against the service.
  start: number;
  end: number;
  text: string;
  tokens: number[];
  temperature: number;
  avg_logprob: number;
  compression_ratio: number;
  no_speech_prob: number;
};
|
|
|
|
/**
 * Native Whisper API response format (as returned for `verbose_json`).
 */
export type WhisperResponse = {
  task: "transcribe";
  language: string;
  duration: number;
  /** Full transcription text. */
  text: string;
  /** Timestamped segments. */
  segments: WhisperSegment[];
};

/**
 * Successful result of `transcribeAudio`: the native Whisper API response,
 * returned directly without remapping.
 */
export type TranscriptionResponse = WhisperResponse;
|
|
|
|
/**
 * Failure result of `transcribeAudio`.
 *
 * Callers can distinguish it from a successful response with `'error' in result`.
 */
export type TranscriptionError = {
  /** Short human-readable summary of the failure. */
  error: string;
  /** Machine-readable failure category. */
  code:
    | "FILE_TOO_LARGE"
    | "INVALID_FORMAT"
    | "TRANSCRIPTION_FAILED"
    | "UPLOAD_FAILED"
    | "SERVICE_ERROR";
  /** Optional extra detail (e.g. HTTP status or underlying error message). */
  details?: string;
};
|
|
|
|
/**
 * Transcribe audio to text using the internal Speech-to-Text service.
 *
 * Downloads the audio from `options.audioUrl`, enforces the 16MB size limit,
 * then forwards the file as a multipart upload to the
 * `v1/audio/transcriptions` endpoint under `ENV.forgeApiUrl`.
 *
 * This function does not throw: every failure (missing configuration,
 * download problems, oversized file, service errors, malformed responses)
 * is reported as a `TranscriptionError` value.
 *
 * @param options - Audio URL plus optional language hint and custom prompt
 * @returns The native Whisper response on success, otherwise a `TranscriptionError`
 */
export async function transcribeAudio(
  options: TranscribeOptions
): Promise<TranscriptionResponse | TranscriptionError> {
  const maxSizeMB = 16;
  const maxSizeBytes = maxSizeMB * 1024 * 1024;

  // Builds the FILE_TOO_LARGE error for a payload of the given byte size
  const fileTooLarge = (sizeBytes: number): TranscriptionError => ({
    error: "Audio file exceeds maximum size limit",
    code: "FILE_TOO_LARGE",
    details: `File size is ${(sizeBytes / (1024 * 1024)).toFixed(2)}MB, maximum allowed is ${maxSizeMB}MB`
  });

  try {
    // Step 1: Validate environment configuration
    if (!ENV.forgeApiUrl) {
      return {
        error: "Voice transcription service is not configured",
        code: "SERVICE_ERROR",
        details: "BUILT_IN_FORGE_API_URL is not set"
      };
    }
    if (!ENV.forgeApiKey) {
      return {
        error: "Voice transcription service authentication is missing",
        code: "SERVICE_ERROR",
        details: "BUILT_IN_FORGE_API_KEY is not set"
      };
    }

    // Step 2: Download audio from URL
    let audioBuffer: Buffer;
    let mimeType: string;
    try {
      const download = await fetch(options.audioUrl);
      if (!download.ok) {
        return {
          error: "Failed to download audio file",
          code: "INVALID_FORMAT",
          details: `HTTP ${download.status}: ${download.statusText}`
        };
      }

      // Reject oversized files before buffering them in memory when the server
      // declares the length up front. Content-Length is only trusted when the
      // body is not content-encoded, since it would then describe the encoded
      // size rather than the audio size.
      const declaredLength = Number(download.headers.get("content-length"));
      if (
        !download.headers.get("content-encoding") &&
        Number.isFinite(declaredLength) &&
        declaredLength > maxSizeBytes
      ) {
        // Release the connection instead of leaving the body unread
        await download.body?.cancel().catch(() => undefined);
        return fileTooLarge(declaredLength);
      }

      audioBuffer = Buffer.from(await download.arrayBuffer());
      mimeType = download.headers.get("content-type") || "audio/mpeg";

      // Check actual file size (covers servers that omit or misreport the length)
      if (audioBuffer.length > maxSizeBytes) {
        return fileTooLarge(audioBuffer.length);
      }
    } catch (error) {
      return {
        error: "Failed to fetch audio file",
        code: "SERVICE_ERROR",
        details: error instanceof Error ? error.message : "Unknown error"
      };
    }

    // Step 3: Create FormData for multipart upload to Whisper API
    const formData = new FormData();

    // Create a Blob from the buffer and append to form
    const filename = `audio.${getFileExtension(mimeType)}`;
    const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType });
    formData.append("file", audioBlob, filename);

    formData.append("model", "whisper-1");
    formData.append("response_format", "verbose_json");

    // Add prompt - use custom prompt if provided (an empty prompt counts as
    // not provided), otherwise generate one based on language
    const prompt = options.prompt || (
      options.language
        ? `Transcribe the user's voice to text, the user's working language is ${getLanguageName(options.language)}`
        : "Transcribe the user's voice to text"
    );
    formData.append("prompt", prompt);

    // Step 4: Call the transcription service
    // A trailing slash keeps the last path segment of the base URL when resolving
    const baseUrl = ENV.forgeApiUrl.endsWith("/")
      ? ENV.forgeApiUrl
      : `${ENV.forgeApiUrl}/`;

    const fullUrl = new URL("v1/audio/transcriptions", baseUrl).toString();

    const response = await fetch(fullUrl, {
      method: "POST",
      headers: {
        authorization: `Bearer ${ENV.forgeApiKey}`,
        "Accept-Encoding": "identity",
      },
      body: formData,
    });

    if (!response.ok) {
      const errorText = await response.text().catch(() => "");
      return {
        error: "Transcription service request failed",
        code: "TRANSCRIPTION_FAILED",
        details: `${response.status} ${response.statusText}${errorText ? `: ${errorText}` : ""}`
      };
    }

    // Step 5: Parse and return the transcription result
    const whisperResponse = (await response.json()) as WhisperResponse | null;

    // Validate response structure. An empty string is a valid transcription
    // (e.g. silent audio), so only the type of `text` is checked.
    if (!whisperResponse || typeof whisperResponse.text !== "string") {
      return {
        error: "Invalid transcription response",
        code: "SERVICE_ERROR",
        details: "Transcription service returned an invalid response format"
      };
    }

    return whisperResponse; // Return native Whisper API response directly
  } catch (error) {
    // Handle unexpected errors
    return {
      error: "Voice transcription failed",
      code: "SERVICE_ERROR",
      details: error instanceof Error ? error.message : "An unexpected error occurred"
    };
  }
}
|
|
|
|
/**
 * Helper function to get file extension from MIME type.
 *
 * Accepts a raw Content-Type value: parameters (e.g. `;codecs=opus`),
 * surrounding whitespace and letter case are ignored before the lookup.
 *
 * @param mimeType - MIME type or full Content-Type header value
 * @returns The matching file extension, or `'audio'` when the type is unknown
 */
function getFileExtension(mimeType: string): string {
  const mimeToExt: Record<string, string> = {
    'audio/webm': 'webm',
    'audio/mp3': 'mp3',
    'audio/mpeg': 'mp3',
    'audio/wav': 'wav',
    'audio/wave': 'wav',
    'audio/x-wav': 'wav',
    'audio/ogg': 'ogg',
    'audio/m4a': 'm4a',
    'audio/x-m4a': 'm4a',
    'audio/mp4': 'm4a',
    'audio/flac': 'flac',
  };

  // Keep only the type/subtype part: "audio/webm;codecs=opus" -> "audio/webm"
  const [rawType = ''] = mimeType.split(';');
  const essence = rawType.trim().toLowerCase();

  // Own-property check so inherited keys such as "toString" never match
  if (Object.prototype.hasOwnProperty.call(mimeToExt, essence)) {
    return mimeToExt[essence] || 'audio';
  }
  return 'audio';
}
|
|
|
|
/**
 * Helper function to get full language name from ISO code.
 *
 * The lookup is case-insensitive and falls back to the primary subtag, so
 * region-qualified codes such as "en-US" or "pt_BR" resolve to the base
 * language name.
 *
 * @param langCode - Language code (e.g. "en", "zh", "en-US")
 * @returns The English language name, or `langCode` unchanged when unknown
 */
function getLanguageName(langCode: string): string {
  const langMap: Record<string, string> = {
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'de': 'German',
    'it': 'Italian',
    'pt': 'Portuguese',
    'ru': 'Russian',
    'ja': 'Japanese',
    'ko': 'Korean',
    'zh': 'Chinese',
    'ar': 'Arabic',
    'hi': 'Hindi',
    'nl': 'Dutch',
    'pl': 'Polish',
    'tr': 'Turkish',
    'sv': 'Swedish',
    'da': 'Danish',
    'no': 'Norwegian',
    'fi': 'Finnish',
  };

  // Own-property lookup so inherited keys such as "constructor" never match
  const lookup = (code: string): string | undefined =>
    Object.prototype.hasOwnProperty.call(langMap, code) ? langMap[code] : undefined;

  const normalized = langCode.trim().toLowerCase();
  // Primary subtag: "en-us" -> "en", "pt_br" -> "pt"
  const [primary = normalized] = normalized.split(/[-_]/);

  return lookup(normalized) ?? lookup(primary) ?? langCode;
}
|
|
|
|
/**
 * Example tRPC procedure implementation:
 *
 * ```ts
 * // In server/routers.ts
 * import { transcribeAudio } from "./_core/voiceTranscription";
 *
 * export const voiceRouter = router({
 *   transcribe: protectedProcedure
 *     .input(z.object({
 *       audioUrl: z.string(),
 *       language: z.string().optional(),
 *       prompt: z.string().optional(),
 *     }))
 *     .mutation(async ({ input, ctx }) => {
 *       const result = await transcribeAudio(input);
 *
 *       // Check if it's an error
 *       if ('error' in result) {
 *         throw new TRPCError({
 *           code: 'BAD_REQUEST',
 *           message: result.error,
 *           cause: result,
 *         });
 *       }
 *
 *       // Optionally save transcription to database
 *       await db.insert(transcriptions).values({
 *         userId: ctx.user.id,
 *         text: result.text,
 *         duration: result.duration,
 *         language: result.language,
 *         audioUrl: input.audioUrl,
 *         createdAt: new Date(),
 *       });
 *
 *       return result;
 *     }),
 * });
 * ```
 */
|