Initial project bootstrap
这个提交包含在:
284
server/_core/voiceTranscription.ts
普通文件
284
server/_core/voiceTranscription.ts
普通文件
@@ -0,0 +1,284 @@
|
||||
/**
 * Voice transcription helper using internal Speech-to-Text service
 *
 * Frontend implementation guide:
 * 1. Capture audio using MediaRecorder API
 * 2. Upload audio to storage (e.g., S3) to get URL
 * 3. Call transcription with the URL
 *
 * Example usage:
 * ```tsx
 * // Frontend component
 * const transcribeMutation = trpc.voice.transcribe.useMutation({
 *   onSuccess: (data) => {
 *     console.log(data.text); // Full transcription
 *     console.log(data.language); // Detected language
 *     console.log(data.segments); // Timestamped segments
 *   }
 * });
 *
 * // After uploading audio to storage
 * transcribeMutation.mutate({
 *   audioUrl: uploadedAudioUrl,
 *   language: 'en', // optional
 *   prompt: 'Transcribe the meeting' // optional
 * });
 * ```
 */
|
||||
import { ENV } from "./env";
|
||||
|
||||
/**
 * Input accepted by `transcribeAudio`.
 */
export type TranscribeOptions = {
  /** URL to the audio file (e.g., S3 URL). */
  audioUrl: string;
  /** Optional: specify language code (e.g., "en", "es", "zh"). */
  language?: string;
  /** Optional: custom prompt for the transcription. */
  prompt?: string;
};
|
||||
|
||||
/**
 * Native Whisper API segment format.
 *
 * One element of `WhisperResponse.segments`. This module passes the fields
 * through untouched; their exact meaning is defined by the upstream service.
 */
export type WhisperSegment = {
  id: number;
  seek: number;
  /** Segment start timestamp (presumably seconds — confirm against the service docs). */
  start: number;
  /** Segment end timestamp (presumably seconds — confirm against the service docs). */
  end: number;
  /** Transcribed text of this segment. */
  text: string;
  tokens: number[];
  temperature: number;
  avg_logprob: number;
  compression_ratio: number;
  no_speech_prob: number;
};
|
||||
|
||||
/**
 * Native Whisper API response format.
 *
 * This is the shape `transcribeAudio` requests from the service via
 * `response_format=verbose_json` and returns to its caller unchanged.
 */
export type WhisperResponse = {
  task: "transcribe";
  /** Language reported by the service for the audio. */
  language: string;
  /** Audio duration (presumably seconds — confirm against the service docs). */
  duration: number;
  /** Full transcription text. */
  text: string;
  /** Timestamped segments making up the transcription. */
  segments: WhisperSegment[];
};
|
||||
|
||||
/**
 * Successful result of `transcribeAudio`: an alias of the native Whisper
 * response, which is returned to callers without any reshaping.
 */
export type TranscriptionResponse = WhisperResponse; // Return native Whisper API response directly
|
||||
|
||||
/**
 * Failure result of `transcribeAudio`.
 *
 * Callers can tell it apart from a successful `TranscriptionResponse` by the
 * presence of the `error` property.
 */
export type TranscriptionError = {
  /** Short human-readable summary of the failure. */
  error: string;
  /** Machine-readable failure category. */
  code:
    | "FILE_TOO_LARGE"
    | "INVALID_FORMAT"
    | "TRANSCRIPTION_FAILED"
    | "UPLOAD_FAILED"
    | "SERVICE_ERROR";
  /** Optional extra detail (HTTP status, underlying error message, ...). */
  details?: string;
};
|
||||
|
||||
/**
 * Transcribe audio to text using the internal Speech-to-Text service.
 *
 * Steps: validate the service configuration, download the audio from
 * `options.audioUrl`, enforce the 16MB size limit, upload the audio as
 * multipart form data to `v1/audio/transcriptions`, and return the parsed
 * `verbose_json` response.
 *
 * Failures are reported through the returned `TranscriptionError` object;
 * errors raised inside the function are caught and converted, so callers
 * should check for the `error` property rather than rely on `try/catch`.
 *
 * @param options - Audio data and metadata
 * @returns Transcription result or error
 */
export async function transcribeAudio(
  options: TranscribeOptions
): Promise<TranscriptionResponse | TranscriptionError> {
  // Maximum accepted audio payload, in megabytes.
  const maxSizeMB = 16;
  const bytesPerMB = 1024 * 1024;

  // Builds the size-limit error for a payload of the given size.
  const fileTooLarge = (sizeMB: number): TranscriptionError => ({
    error: "Audio file exceeds maximum size limit",
    code: "FILE_TOO_LARGE",
    details: `File size is ${sizeMB.toFixed(2)}MB, maximum allowed is ${maxSizeMB}MB`,
  });

  try {
    // Step 1: Validate environment configuration
    if (!ENV.forgeApiUrl) {
      return {
        error: "Voice transcription service is not configured",
        code: "SERVICE_ERROR",
        details: "BUILT_IN_FORGE_API_URL is not set",
      };
    }
    if (!ENV.forgeApiKey) {
      return {
        error: "Voice transcription service authentication is missing",
        code: "SERVICE_ERROR",
        details: "BUILT_IN_FORGE_API_KEY is not set",
      };
    }

    // Step 2: Download audio from URL
    let audioBuffer: Buffer;
    let mimeType: string;
    try {
      // NOTE(security): audioUrl is caller-supplied and fetched from the
      // server, which is an SSRF vector. Consider restricting it to the
      // application's storage hosts before exposing this to untrusted users.
      const download = await fetch(options.audioUrl);
      if (!download.ok) {
        return {
          error: "Failed to download audio file",
          code: "INVALID_FORMAT",
          details: `HTTP ${download.status}: ${download.statusText}`,
        };
      }

      // Reject oversized files from the declared Content-Length before
      // buffering the whole body into memory. The header may be missing or
      // malformed, so the post-download check below still applies.
      const contentLength = download.headers.get("content-length");
      const declaredBytes = contentLength === null ? NaN : Number(contentLength);
      if (Number.isFinite(declaredBytes) && declaredBytes / bytesPerMB > maxSizeMB) {
        // Release the connection; the body is not needed.
        await download.body?.cancel().catch(() => undefined);
        return fileTooLarge(declaredBytes / bytesPerMB);
      }

      audioBuffer = Buffer.from(await download.arrayBuffer());
      mimeType = download.headers.get("content-type") || "audio/mpeg";

      // Check file size (16MB limit)
      const sizeMB = audioBuffer.length / bytesPerMB;
      if (sizeMB > maxSizeMB) {
        return fileTooLarge(sizeMB);
      }
    } catch (error) {
      return {
        error: "Failed to fetch audio file",
        code: "SERVICE_ERROR",
        details: error instanceof Error ? error.message : "Unknown error",
      };
    }

    // Step 3: Create FormData for multipart upload to Whisper API
    const formData = new FormData();

    // Create a Blob from the buffer and append to form
    const filename = `audio.${getFileExtension(mimeType)}`;
    const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType });
    formData.append("file", audioBlob, filename);

    formData.append("model", "whisper-1");
    formData.append("response_format", "verbose_json");

    // Add prompt - use custom prompt if provided, otherwise generate based on language.
    // `||` is deliberate: an empty custom prompt falls back to the generated one.
    const prompt =
      options.prompt ||
      (options.language
        ? `Transcribe the user's voice to text, the user's working language is ${getLanguageName(options.language)}`
        : "Transcribe the user's voice to text");
    formData.append("prompt", prompt);

    // Step 4: Call the transcription service
    // A trailing slash is required so the relative path below is appended to
    // the base URL's path instead of replacing its last segment.
    const baseUrl = ENV.forgeApiUrl.endsWith("/")
      ? ENV.forgeApiUrl
      : `${ENV.forgeApiUrl}/`;

    const fullUrl = new URL("v1/audio/transcriptions", baseUrl).toString();

    const response = await fetch(fullUrl, {
      method: "POST",
      headers: {
        authorization: `Bearer ${ENV.forgeApiKey}`,
        "Accept-Encoding": "identity",
      },
      body: formData,
    });

    if (!response.ok) {
      const errorText = await response.text().catch(() => "");
      return {
        error: "Transcription service request failed",
        code: "TRANSCRIPTION_FAILED",
        details: `${response.status} ${response.statusText}${errorText ? `: ${errorText}` : ""}`,
      };
    }

    // Step 5: Parse and return the transcription result
    const payload: unknown = await response.json();

    // Validate response structure. Only the type of `text` is checked: an
    // empty string is a legitimate transcription (e.g. silent audio) and
    // must not be reported as a malformed response.
    if (
      typeof payload !== "object" ||
      payload === null ||
      typeof (payload as { text?: unknown }).text !== "string"
    ) {
      return {
        error: "Invalid transcription response",
        code: "SERVICE_ERROR",
        details: "Transcription service returned an invalid response format",
      };
    }

    return payload as WhisperResponse; // Return native Whisper API response directly
  } catch (error) {
    // Handle unexpected errors
    return {
      error: "Voice transcription failed",
      code: "SERVICE_ERROR",
      details: error instanceof Error ? error.message : "An unexpected error occurred",
    };
  }
}
|
||||
|
||||
/**
 * Helper function to get file extension from MIME type.
 *
 * The input is normalised before lookup: any parameters after `;` are
 * dropped (e.g. `audio/webm;codecs=opus`) and the type is matched
 * case-insensitively, so raw `Content-Type` header values can be passed in.
 *
 * @param mimeType - MIME type, optionally with parameters
 * @returns The matching file extension without a dot, or `'audio'` when the
 *          type is not recognised
 */
function getFileExtension(mimeType: string): string {
  // A Map (rather than a plain object) so that inputs such as "constructor"
  // cannot resolve to inherited Object.prototype members.
  const mimeToExt = new Map<string, string>([
    ['audio/webm', 'webm'],
    ['audio/mp3', 'mp3'],
    ['audio/mpeg', 'mp3'],
    ['audio/wav', 'wav'],
    ['audio/wave', 'wav'],
    ['audio/x-wav', 'wav'],
    ['audio/ogg', 'ogg'],
    ['audio/m4a', 'm4a'],
    ['audio/x-m4a', 'm4a'],
    ['audio/mp4', 'm4a'],
    ['audio/flac', 'flac'],
  ]);

  // Keep only the "type/subtype" part of the media type.
  const [essence = ''] = mimeType.split(';');

  return mimeToExt.get(essence.trim().toLowerCase()) ?? 'audio';
}
|
||||
|
||||
/**
 * Helper function to get full language name from ISO code.
 *
 * Lookup is case-insensitive and tolerates region/script suffixes separated
 * by `-` or `_` (e.g. `en-US`, `pt_BR`, `zh-Hant`), which are resolved via
 * their primary language subtag.
 *
 * @param langCode - Language code such as "en", "es" or "zh"
 * @returns The English language name, or `langCode` unchanged when the code
 *          is not in the table
 */
function getLanguageName(langCode: string): string {
  // A Map (rather than a plain object) so that inputs such as "constructor"
  // cannot resolve to inherited Object.prototype members.
  const langMap = new Map<string, string>([
    ['en', 'English'],
    ['es', 'Spanish'],
    ['fr', 'French'],
    ['de', 'German'],
    ['it', 'Italian'],
    ['pt', 'Portuguese'],
    ['ru', 'Russian'],
    ['ja', 'Japanese'],
    ['ko', 'Korean'],
    ['zh', 'Chinese'],
    ['ar', 'Arabic'],
    ['hi', 'Hindi'],
    ['nl', 'Dutch'],
    ['pl', 'Polish'],
    ['tr', 'Turkish'],
    ['sv', 'Swedish'],
    ['da', 'Danish'],
    ['no', 'Norwegian'],
    ['fi', 'Finnish'],
  ]);

  const normalized = langCode.trim().toLowerCase();
  // Primary subtag: the part before the first "-" or "_".
  const [primary = ''] = normalized.split(/[-_]/);

  return langMap.get(normalized) ?? langMap.get(primary) ?? langCode;
}
|
||||
|
||||
/**
 * Example tRPC procedure implementation:
 *
 * ```ts
 * // In server/routers.ts
 * import { transcribeAudio } from "./_core/voiceTranscription";
 *
 * export const voiceRouter = router({
 *   transcribe: protectedProcedure
 *     .input(z.object({
 *       audioUrl: z.string(),
 *       language: z.string().optional(),
 *       prompt: z.string().optional(),
 *     }))
 *     .mutation(async ({ input, ctx }) => {
 *       const result = await transcribeAudio(input);
 *
 *       // Check if it's an error
 *       if ('error' in result) {
 *         throw new TRPCError({
 *           code: 'BAD_REQUEST',
 *           message: result.error,
 *           cause: result,
 *         });
 *       }
 *
 *       // Optionally save transcription to database
 *       await db.insert(transcriptions).values({
 *         userId: ctx.user.id,
 *         text: result.text,
 *         duration: result.duration,
 *         language: result.language,
 *         audioUrl: input.audioUrl,
 *         createdAt: new Date(),
 *       });
 *
 *       return result;
 *     }),
 * });
 * ```
 */
|
||||
在新工单中引用
屏蔽一个用户