这是indexloc提供的服务，不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");

describe("YoutubeTranscript", () => {
  // Live integration check: the call below performs a real network request,
  // so the test case is given a 30 second timeout.
  const NETWORK_TIMEOUT_MS = 30000;

  it(
    "should fetch transcript from YouTube video",
    async () => {
      const transcript = await YoutubeTranscript.fetchTranscript(
        "BJjsfNO5JTo",
        { lang: "en" }
      );

      // The transcript must come back as a non-empty string.
      expect(transcript).toBeDefined();
      expect(typeof transcript).toBe("string");
      expect(transcript.length).toBeGreaterThan(0);

      // Print a short preview to help with debugging.
      const preview = transcript.substring(0, 200) + "...";
      console.log("Success! Transcript length:", transcript.length);
      console.log("First 200 characters:", preview);
    },
    NETWORK_TIMEOUT_MS
  );
});
Original file line number Diff line number Diff line change
@@ -1,110 +1,156 @@
const { parse } = require("node-html-parser");
// Case-insensitive pattern for YouTube URLs: youtube.com paths ending in
// /v/, /e/ or /embed/, query-string forms containing ?v= or &v=, and
// youtu.be/ short links. Capture group 1 is the 11-character identifier
// that follows (no quotes, '&', '?', '/' or whitespace).
const RE_YOUTUBE =
/(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
// Browser-like string sent as the "User-Agent" header on outgoing requests.
const USER_AGENT =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";

/**
 * Error type thrown for transcript retrieval failures.
 * The message is always prefixed with "[YoutubeTranscript]" so the origin
 * of the failure is visible in logs.
 */
class YoutubeTranscriptError extends Error {
  /**
   * @param {string} message - Description of the failure (without prefix)
   */
  constructor(message) {
    super(`[YoutubeTranscript] ${message}`);
    // Without this, `name` is inherited from Error and the subclass is
    // indistinguishable in `err.name` checks and `toString()` output.
    this.name = "YoutubeTranscriptError";
  }
}

/**
* Class to retrieve a transcript, if one exists
* Handles fetching and parsing YouTube video transcripts
*/
class YoutubeTranscript {
/**
* Fetch transcript from YTB Video
* @param videoId Video url or video identifier
* @param config Object with lang param (eg: en, es, hk, uk) format.
* Will just grab the first caption if it can find one, so no special lang caption support.
* Encodes a string as a protobuf field
* @param {number} fieldNumber - The protobuf field number
* @param {string} str - The string to encode
* @returns {Buffer} Encoded protobuf field
*/
static #encodeProtobufString(fieldNumber, str) {
const utf8Bytes = Buffer.from(str, "utf8");
const tag = (fieldNumber << 3) | 2; // wire type 2 for string
const lengthBytes = this.#encodeVarint(utf8Bytes.length);

return Buffer.concat([
Buffer.from([tag]),
Buffer.from(lengthBytes),
utf8Bytes,
]);
}

/**
* Encodes a number as a protobuf varint
* @param {number} value - The number to encode
* @returns {number[]} Encoded varint bytes
*/
static #encodeVarint(value) {
const bytes = [];
while (value >= 0x80) {
bytes.push((value & 0x7f) | 0x80);
value >>>= 7;
}
bytes.push(value);
return bytes;
}

/**
* Creates a base64 encoded protobuf message
* @param {Object} param - The parameters to encode
* @param {string} param.param1 - First parameter
* @param {string} param.param2 - Second parameter
* @returns {string} Base64 encoded protobuf
*/
static #getBase64Protobuf({ param1, param2 }) {
const field1 = this.#encodeProtobufString(1, param1);
const field2 = this.#encodeProtobufString(2, param2);
return Buffer.concat([field1, field2]).toString("base64");
}

/**
* Extracts transcript text from YouTube API response
* @param {Object} responseData - The YouTube API response
* @returns {string} Combined transcript text
*/
static #extractTranscriptFromResponse(responseData) {
const transcriptRenderer =
responseData.actions?.[0]?.updateEngagementPanelAction?.content
?.transcriptRenderer;
if (!transcriptRenderer) {
throw new Error("No transcript data found in response");
}

const segments =
transcriptRenderer.content?.transcriptSearchPanelRenderer?.body
?.transcriptSegmentListRenderer?.initialSegments;
if (!segments) {
throw new Error("Transcript segments not found in response");
}

return segments
.map((segment) => {
const runs = segment.transcriptSegmentRenderer?.snippet?.runs;
return runs ? runs.map((run) => run.text).join("") : "";
})
.filter((text) => text)
.join(" ")
.trim()
.replace(/\s+/g, " ");
}

/**
* Fetch transcript from YouTube video
* @param {string} videoId - Video URL or video identifier
* @param {Object} config - Configuration options
* @param {string} [config.lang='en'] - Language code (e.g., 'en', 'es', 'fr')
* @returns {Promise<string>} Video transcript text
*/
static async fetchTranscript(videoId, config = {}) {
const identifier = this.retrieveVideoId(videoId);
const lang = config?.lang ?? "en";

try {
const transcriptUrl = await fetch(
`https://www.youtube.com/watch?v=${identifier}`,
const innerProto = this.#getBase64Protobuf({
param1: "asr",
param2: lang,
});
const params = this.#getBase64Protobuf({
param1: identifier,
param2: innerProto,
});

const response = await fetch(
"https://www.youtube.com/youtubei/v1/get_transcript",
{
method: "POST",
headers: {
"User-Agent": USER_AGENT,
"Content-Type": "application/json",
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)",
},
body: JSON.stringify({
context: {
client: {
clientName: "WEB",
clientVersion: "2.20240826.01.00",
},
},
params,
}),
}
)
.then((res) => res.text())
.then((html) => parse(html))
.then((html) => this.#parseTranscriptEndpoint(html, lang));

if (!transcriptUrl)
throw new Error("Failed to locate a transcript for this video!");

// Result is hopefully some XML.
const transcriptXML = await fetch(transcriptUrl)
.then((res) => res.text())
.then((xml) => parse(xml));

let transcript = "";
const chunks = transcriptXML.getElementsByTagName("text");
for (const chunk of chunks) {
// Add space after each text chunk
transcript += chunk.textContent + " ";
}

// Trim extra whitespace
return transcript.trim().replace(/\s+/g, " ");
} catch (e) {
throw new YoutubeTranscriptError(e);
}
}

static #parseTranscriptEndpoint(document, langCode = null) {
try {
// Get all script tags on document page
const scripts = document.getElementsByTagName("script");

// find the player data script.
const playerScript = scripts.find((script) =>
script.textContent.includes("var ytInitialPlayerResponse = {")
);

const dataString =
playerScript.textContent
?.split("var ytInitialPlayerResponse = ")?.[1] //get the start of the object {....
?.split("};")?.[0] + // chunk off any code after object closure.
"}"; // add back that curly brace we just cut.

const data = JSON.parse(dataString.trim()); // Attempt a JSON parse
const availableCaptions =
data?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];

// If languageCode was specified then search for it's code, otherwise get the first.
let captionTrack = availableCaptions?.[0];
if (langCode)
captionTrack =
availableCaptions.find((track) =>
track.languageCode.includes(langCode)
) ?? availableCaptions?.[0];

return captionTrack?.baseUrl;
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}

const responseData = await response.json();
return this.#extractTranscriptFromResponse(responseData);
} catch (e) {
console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`);
return null;
throw new YoutubeTranscriptError(e.message || e);
}
}

/**
* Retrieve video id from url or string
* @param videoId video url or video id
* Extract video ID from a YouTube URL or verify an existing ID
* @param {string} videoId - Video URL or ID
* @returns {string} YouTube video ID
*/
static retrieveVideoId(videoId) {
if (videoId.length === 11) {
return videoId;
}
if (videoId.length === 11) return videoId;

const RE_YOUTUBE =
/(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
const matchId = videoId.match(RE_YOUTUBE);
if (matchId && matchId.length) {
return matchId[1];
}

if (matchId?.[1]) return matchId[1];
throw new YoutubeTranscriptError(
"Impossible to retrieve Youtube video ID."
);
Expand Down