这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 40 additions & 3 deletions collector/utils/extensions/YoutubeTranscript/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ const { validYoutubeVideoUrl } = require("../../url");
/**
* Fetch the transcript content for a YouTube video
* @param {string} url - The URL of the YouTube video
* @returns {Promise<{success: boolean, reason: string|null, content: string|null, metadata: Object}>} - The transcript content for the YouTube video
* @returns {Promise<{success: boolean, reason: string|null, content: string|null, metadata: TranscriptMetadata}>} - The transcript content for the YouTube video
*/
async function fetchVideoTranscriptContent({ url }) {
if (!validYoutubeVideoUrl(url)) {
Expand Down Expand Up @@ -64,11 +64,20 @@ async function fetchVideoTranscriptContent({ url }) {
};
}

/**
* @typedef {Object} TranscriptMetadata
* @property {string} title - The title of the video
* @property {string} author - The author of the video
* @property {string} description - The description of the video
* @property {string} view_count - The view count of the video
* @property {string} source - The source of the video (videoId)
*/

/**
* @typedef {Object} TranscriptAsDocument
* @property {boolean} success - Whether the transcript was loaded successfully
* @property {string|null} reason - Why the transcript could not be loaded, or null on success
* @property {{title: string, author: string, destination: string}} data - The data from the transcript
* @property {TranscriptMetadata} metadata - The metadata from the transcript
*/

/**
Expand Down Expand Up @@ -104,11 +113,12 @@ async function loadYouTubeTranscript({ url }, options = { parseOnly: false }) {
}

const { content, metadata } = transcriptResults;

if (options.parseOnly) {
return {
success: true,
reason: null,
content,
content: buildTranscriptContentWithMetadata(content, metadata),
documents: [],
saveAsDocument: options.parseOnly,
data: {},
Expand Down Expand Up @@ -154,6 +164,33 @@ async function loadYouTubeTranscript({ url }, options = { parseOnly: false }) {
};
}

/**
 * Merge the transcript content and selected video metadata into a single string.
 *
 * Why? For ephemeral documents where we just want the content, we include the metadata as
 * tagged keys ahead of the transcript so that the LLM has context about the video. This
 * gives it a better understanding of the video and allows it to use the metadata in the
 * conversation if relevant.
 * Examples:
 * - How many views does <LINK> have?
 * - Checkout <LINK> and tell me the key points and if it is performing well
 * - Summarize this video <LINK>? -> description could have links and references
 *
 * Only `title`, `author`, `description` and `view_count` are emitted (in that order), each
 * wrapped as `<key>value</key>` with no separator between tags. Every other metadata key
 * is ignored. When the content is empty, or when the metadata yields no usable value, the
 * content is returned unchanged so that no empty "Transcript:" preamble is produced.
 * @param {string} content - The content of the transcript
 * @param {TranscriptMetadata} metadata - The metadata from the transcript
 * @returns {string} - The tagged metadata followed by "\nTranscript:\n" and the content,
 * or the content as-is when there is nothing to prepend
 */
function buildTranscriptContentWithMetadata(content = "", metadata = {}) {
  const VALID_METADATA_KEYS = ["title", "author", "description", "view_count"];
  if (!content || !metadata) return content;

  // A numeric 0 (e.g. a video with zero views) is real data and must not be
  // discarded by a plain truthiness check; all other falsy values mean "missing".
  const hasValue = (value) => Boolean(value) || value === 0;

  const metadataTags = VALID_METADATA_KEYS.filter((key) =>
    hasValue(metadata[key])
  )
    .map((key) => `<${key}>${metadata[key]}</${key}>`)
    .join("");

  // Nothing usable was found (empty object, or only non-whitelisted keys such as
  // `source`): hand back the raw transcript instead of a header with no metadata.
  if (!metadataTags) return content;
  return `${metadataTags}\nTranscript:\n${content}`;
}

module.exports = {
loadYouTubeTranscript,
fetchVideoTranscriptContent,
Expand Down