feat: added support for audio timestamp understanding to Google Vertex (#4061)

Co-authored-by: Lars Grammel <[email protected]>
vercel · Dec 17, 2024 · db31e74 · db31e74
1 parent c53ebee
commit db31e74
Show file tree

Hide file tree

Showing 6 changed files with 81 additions and 0 deletions.
diff --git a/.changeset/poor-apples-punch.md b/.changeset/poor-apples-punch.md
@@ -0,0 +1,5 @@
+---
+'@ai-sdk/google': patch
+---
+
+feat: adding audioTimestamp support to GoogleGenerativeAISettings
diff --git a/content/providers/01-ai-sdk-providers/11-google-vertex.mdx b/content/providers/01-ai-sdk-providers/11-google-vertex.mdx
@@ -294,6 +294,13 @@ The following optional settings are available for Google Vertex models:
 
   Optional. When enabled, the model will [use Google search to ground the response](https://cloud.google.com/vertex-ai/generative-ai/docs/grounding/overview).
 
+- **audioTimestamp** _boolean_
+
+  Optional. Enables timestamp understanding for audio-only files. Defaults to false.
+
+  This is useful for generating transcripts with accurate timestamps.
+  Consult [Google's Documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/audio-understanding) for usage details.
+
 You can use Google Vertex language models to generate text with the `generateText` function:
 
 ```ts highlight="1,4"

diff --git a/examples/ai-core/src/e2e/google-vertex.test.ts b/examples/ai-core/src/e2e/google-vertex.test.ts
@@ -392,6 +392,36 @@ describe.each(Object.values(RUNTIME_VARIANTS))(
         expect(result.text.toLowerCase()).toContain('cat');
         expect(result.usage?.totalTokens).toBeGreaterThan(0);
       });
+
+      // Smoke test for audio file parts: sends ./data/galileo.mp3 together
+      // with a transcription prompt and checks that the response text is
+      // non-empty, mentions "galileo" (case-insensitive), and reports a
+      // positive total token count. The model is created with default
+      // settings, i.e. without passing `audioTimestamp`.
+      it(
+        'should generate text from audio input',
+        { timeout: LONG_TEST_MILLIS },
+        async () => {
+          const model = vertex(modelId);
+          const result = await generateText({
+            model,
+            messages: [
+              {
+                role: 'user',
+                content: [
+                  {
+                    type: 'text',
+                    text: 'Output a transcript of spoken words. Break up transcript lines when there are pauses. Include timestamps in the format of HH:MM:SS.SSS.',
+                  },
+                  {
+                    type: 'file',
+                    // readFileSync without an encoding already returns a
+                    // Buffer, so no extra Buffer.from() copy is needed.
+                    data: fs.readFileSync('./data/galileo.mp3'),
+                    mimeType: 'audio/mpeg',
+                  },
+                ],
+              },
+            ],
+          });
+          expect(result.text).toBeTruthy();
+          expect(result.text.toLowerCase()).toContain('galileo');
+          expect(result.usage?.totalTokens).toBeGreaterThan(0);
+        },
+      );
     });
 
     describe.each(MODEL_VARIANTS.embedding)('Embedding Model: %s', modelId => {

diff --git a/examples/ai-core/src/generate-text/google-vertex-audio.ts b/examples/ai-core/src/generate-text/google-vertex-audio.ts
@@ -0,0 +1,30 @@
+import { vertex } from '@ai-sdk/google-vertex';
+import { generateText } from 'ai';
+import 'dotenv/config';
+import fs from 'node:fs';
+
+/**
+ * Sends `./data/galileo.mp3` to the `gemini-1.5-flash` Vertex model with a
+ * transcription prompt and prints the generated text to stdout.
+ *
+ * The model is created with `audioTimestamp: true` because the prompt asks
+ * for a timestamp on every transcript line.
+ */
+async function main() {
+  const result = await generateText({
+    model: vertex('gemini-1.5-flash', { audioTimestamp: true }),
+    messages: [
+      {
+        role: 'user',
+        content: [
+          {
+            type: 'text',
+            text: 'Output a transcript of spoken words. Break up transcript lines when there are pauses. Include timestamps in the format of HH:MM:SS.SSS.',
+          },
+          {
+            type: 'file',
+            // readFileSync without an encoding already returns a Buffer, so
+            // no extra Buffer.from() copy is needed.
+            data: fs.readFileSync('./data/galileo.mp3'),
+            mimeType: 'audio/mpeg',
+          },
+        ],
+      },
+    ],
+  });
+
+  console.log(result.text);
+}
+
+main().catch(console.error);
diff --git a/packages/google/src/google-generative-ai-language-model.ts b/packages/google/src/google-generative-ai-language-model.ts
@@ -109,6 +109,9 @@ export class GoogleGenerativeAILanguageModel implements LanguageModelV1 {
         this.supportsStructuredOutputs
           ? convertJSONSchemaToOpenAPISchema(responseFormat.schema)
           : undefined,
+      ...(this.settings.audioTimestamp && {
+        audioTimestamp: this.settings.audioTimestamp,
+      }),
     };
 
     const { contents, systemInstruction } =

diff --git a/packages/google/src/google-generative-ai-settings.ts b/packages/google/src/google-generative-ai-settings.ts
@@ -55,6 +55,12 @@ Optional. A list of unique safety settings for blocking unsafe content.
       | 'BLOCK_ONLY_HIGH'
       | 'BLOCK_NONE';
   }>;
+  /**
+   * Optional. Enables timestamp understanding for audio-only files.
+   *
+   * https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/audio-understanding
+   */
+  audioTimestamp?: boolean;
 
   /**
 Optional. When enabled, the model will use Google search to ground the response.