whisper-large-v3-turbo Beta
Automatic Speech Recognition • OpenAI
Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation.
Usage
Workers - TypeScript
export interface Env {
  AI: Ai;
}

export default {
  async fetch(request, env): Promise<Response> {
    // Fetch a sample audio file and read it as an ArrayBuffer.
    const res = await fetch(
      "https://github.com/Azure-Samples/cognitive-services-speech-sdk/raw/master/samples/cpp/windows/console/samples/enrollment_audio_katie.wav"
    );
    const blob = await res.arrayBuffer();

    // Pass the audio as an array of unsigned 8-bit integers.
    const input = {
      audio: [...new Uint8Array(blob)],
    };

    const response = await env.AI.run(
      "@cf/openai/whisper-large-v3-turbo",
      input
    );

    return Response.json({ input: { audio: [] }, response });
  },
} satisfies ExportedHandler<Env>;
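The example above sends the audio as an array of unsigned 8-bit integers. The input schema further down also documents optional parameters (task, language, vad_filter, initial_prompt, prefix) that can be passed in the same object. A minimal sketch, with illustrative values for the optional fields:

// Inside the same fetch handler as above; `blob` is the ArrayBuffer of the fetched audio.
// The optional values below are illustrative, not required by the model.
const response = await env.AI.run("@cf/openai/whisper-large-v3-turbo", {
  audio: [...new Uint8Array(blob)],
  task: "transcribe",        // or "translate"
  language: "en",
  vad_filter: "false",       // the schema types this as a string
  initial_prompt: "A conversation about llamas.", // illustrative context hint
  prefix: "Speaker:",                             // illustrative output prefix
});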
curl
curl https://api.cloudflare.com/client/v4/accounts/$CLOUDFLARE_ACCOUNT_ID/ai/run/@cf/openai/whisper-large-v3-turbo \
  -X POST \
  -H "Authorization: Bearer $CLOUDFLARE_API_TOKEN" \
  --data-binary "@talking-llama.mp3"
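The same REST call can be made from Node.js 18+ with the built-in fetch. A minimal sketch mirroring the curl command above; the file name and the octet-stream content type are assumptions carried over from the example, not requirements stated by the API:

// Sketch of the REST call from Node.js (ESM, Node 18+ for built-in fetch).
// CLOUDFLARE_ACCOUNT_ID and CLOUDFLARE_API_TOKEN are read from the environment.
import { readFile } from "node:fs/promises";

const accountId = process.env.CLOUDFLARE_ACCOUNT_ID;
const apiToken = process.env.CLOUDFLARE_API_TOKEN;

// Raw audio bytes, equivalent to curl's --data-binary "@talking-llama.mp3".
const audio = await readFile("talking-llama.mp3");

const res = await fetch(
  `https://api.cloudflare.com/client/v4/accounts/${accountId}/ai/run/@cf/openai/whisper-large-v3-turbo`,
  {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiToken}`,
      "Content-Type": "application/octet-stream", // assumption; curl sends the bytes as-is
    },
    body: audio,
  }
);

console.log(await res.json());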
Parameters
* indicates a required field
Input
- audio * (string): Base64 encoded value of the audio data.
- task (string, default "transcribe"): Supported tasks are 'translate' or 'transcribe'.
- language (string, default "en"): The language of the audio being transcribed or translated.
- vad_filter (string, default "false"): Preprocess the audio with a voice activity detection model.
- initial_prompt (string): A text prompt to help provide context to the model on the contents of the audio.
- prefix (string): A prefix appended to the beginning of the transcription output; it can guide the transcription result.
Output
- transcription_info (object):
  - language (string): The language of the audio being transcribed or translated.
  - language_probability (number): The confidence level or probability of the detected language being accurate, represented as a decimal between 0 and 1.
  - duration (number): The total duration of the original audio file, in seconds.
  - duration_after_vad (number): The duration of the audio after applying Voice Activity Detection (VAD) to remove silent or irrelevant sections, in seconds.
- text * (string): The complete transcription of the audio.
- word_count (number): The total number of words in the transcription.
- segments (object):
  - start (number): The starting time of the segment within the audio, in seconds.
  - end (number): The ending time of the segment within the audio, in seconds.
  - text (string): The transcription of the segment.
  - temperature (number): The temperature used in the decoding process, controlling randomness in predictions. Lower values result in more deterministic outputs.
  - avg_logprob (number): The average log probability of the predictions for the words in this segment, indicating overall confidence.
  - compression_ratio (number): The compression ratio of the input to the output, measuring how much the text was compressed during the transcription process.
  - no_speech_prob (number): The probability that the segment contains no speech, represented as a decimal between 0 and 1.
  - words (array of objects):
    - word (string): The individual word transcribed from the audio.
    - start (number): The starting time of the word within the audio, in seconds.
    - end (number): The ending time of the word within the audio, in seconds.
- vtt (string): The transcription in WebVTT format, which includes timing and text information for use in subtitles.
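A sketch of consuming the fields listed above inside the Worker's fetch handler from the Usage section. Only text is required by the output schema, so the other fields are read defensively; treating segments as a list of segment objects is an assumption:

// `input` is the request object built in the Workers example above.
const result = await env.AI.run("@cf/openai/whisper-large-v3-turbo", input);

console.log(result.text);                          // complete transcription (required)
console.log(result.word_count);                    // total words, if present
console.log(result.transcription_info?.language);  // detected language, if present

// Treating `segments` as an array of segment objects (assumption; the schema labels it "object").
for (const segment of result.segments ?? []) {
  console.log(`${segment.start}s - ${segment.end}s: ${segment.text}`);
}

// The WebVTT captions, if present, can be served directly as a subtitle file.
if (result.vtt) {
  return new Response(result.vtt, { headers: { "Content-Type": "text/vtt" } });
}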
API Schemas
The following schemas are based on JSON Schema.
{
  "type": "object",
  "properties": {
    "audio": { "type": "string", "description": "Base64 encoded value of the audio data." },
    "task": { "type": "string", "default": "transcribe", "description": "Supported tasks are 'translate' or 'transcribe'." },
    "language": { "type": "string", "default": "en", "description": "The language of the audio being transcribed or translated." },
    "vad_filter": { "type": "string", "default": "false", "description": "Preprocess the audio with a voice activity detection model." },
    "initial_prompt": { "type": "string", "description": "A text prompt to help provide context to the model on the contents of the audio." },
    "prefix": { "type": "string", "description": "A prefix appended to the beginning of the transcription output; it can guide the transcription result." }
  },
  "required": ["audio"]
}
{
  "type": "object",
  "contentType": "application/json",
  "properties": {
    "transcription_info": {
      "type": "object",
      "properties": {
        "language": { "type": "string", "description": "The language of the audio being transcribed or translated." },
        "language_probability": { "type": "number", "description": "The confidence level or probability of the detected language being accurate, represented as a decimal between 0 and 1." },
        "duration": { "type": "number", "description": "The total duration of the original audio file, in seconds." },
        "duration_after_vad": { "type": "number", "description": "The duration of the audio after applying Voice Activity Detection (VAD) to remove silent or irrelevant sections, in seconds." }
      }
    },
    "text": { "type": "string", "description": "The complete transcription of the audio." },
    "word_count": { "type": "number", "description": "The total number of words in the transcription." },
    "segments": {
      "type": "object",
      "properties": {
        "start": { "type": "number", "description": "The starting time of the segment within the audio, in seconds." },
        "end": { "type": "number", "description": "The ending time of the segment within the audio, in seconds." },
        "text": { "type": "string", "description": "The transcription of the segment." },
        "temperature": { "type": "number", "description": "The temperature used in the decoding process, controlling randomness in predictions. Lower values result in more deterministic outputs." },
        "avg_logprob": { "type": "number", "description": "The average log probability of the predictions for the words in this segment, indicating overall confidence." },
        "compression_ratio": { "type": "number", "description": "The compression ratio of the input to the output, measuring how much the text was compressed during the transcription process." },
        "no_speech_prob": { "type": "number", "description": "The probability that the segment contains no speech, represented as a decimal between 0 and 1." },
        "words": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "word": { "type": "string", "description": "The individual word transcribed from the audio." },
              "start": { "type": "number", "description": "The starting time of the word within the audio, in seconds." },
              "end": { "type": "number", "description": "The ending time of the word within the audio, in seconds." }
            }
          }
        }
      }
    },
    "vtt": { "type": "string", "description": "The transcription in WebVTT format, which includes timing and text information for use in subtitles." }
  },
  "required": ["text"]
}
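For TypeScript Workers, the schemas above can be mirrored as interfaces. The names below (WhisperTurboInput, WhisperTurboOutput, and so on) are a convenience sketch derived from the JSON Schemas on this page, not official type definitions; every output field except text is treated as optional:

// Sketch: TypeScript interfaces derived from the JSON Schemas above.
export interface WhisperTurboInput {
  audio: string;            // Base64 encoded audio data (required)
  task?: "transcribe" | "translate";
  language?: string;
  vad_filter?: string;      // the schema types this as a string, e.g. "false"
  initial_prompt?: string;
  prefix?: string;
}

export interface WhisperTurboWord {
  word?: string;
  start?: number;           // seconds
  end?: number;             // seconds
}

export interface WhisperTurboSegment {
  start?: number;
  end?: number;
  text?: string;
  temperature?: number;
  avg_logprob?: number;
  compression_ratio?: number;
  no_speech_prob?: number;
  words?: WhisperTurboWord[];
}

export interface WhisperTurboOutput {
  transcription_info?: {
    language?: string;
    language_probability?: number;
    duration?: number;
    duration_after_vad?: number;
  };
  text: string;             // the only field required by the output schema
  word_count?: number;
  segments?: WhisperTurboSegment[]; // the schema labels this "object"; an array of segments is assumed here
  vtt?: string;
}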