diff --git a/collector/__tests__/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js b/collector/__tests__/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js index 1fca742fd53..31deba3882e 100644 --- a/collector/__tests__/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js +++ b/collector/__tests__/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js @@ -1,16 +1,32 @@ const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js"); describe("YoutubeTranscript", () => { - it("should fetch transcript from YouTube video", async () => { - const videoId = "BJjsfNO5JTo"; - const transcript = await YoutubeTranscript.fetchTranscript(videoId, { - lang: "en", - }); + if (process.env.GITHUB_ACTIONS) { + console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve."); + it('is stubbed in GitHub Actions', () => expect(true).toBe(true)); + } else { + it("should fetch transcript from YouTube video", async () => { + const videoId = "BJjsfNO5JTo"; + const transcript = await YoutubeTranscript.fetchTranscript(videoId, { + lang: "en", + }); - expect(transcript).toBeDefined(); - expect(typeof transcript).toBe("string"); - expect(transcript.length).toBeGreaterThan(0); - // console.log("Success! Transcript length:", transcript.length); - // console.log("First 200 characters:", transcript.substring(0, 200) + "..."); - }, 30000); + expect(transcript).toBeDefined(); + expect(typeof transcript).toBe("string"); + expect(transcript.length).toBeGreaterThan(0); + console.log("First 200 characters:", transcript.substring(0, 200) + "..."); + }, 30000); + + it("should fetch non asr transcript from YouTube video", async () => { + const videoId = "D111ao6wWH0"; + const transcript = await YoutubeTranscript.fetchTranscript(videoId, { + lang: "zh-HK", + }); + + expect(transcript).toBeDefined(); + expect(typeof transcript).toBe("string"); + expect(transcript.length).toBeGreaterThan(0); + console.log("First 200 characters:", transcript.substring(0, 200) + "..."); + }, 30000); + } }); diff --git a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js index 3f0a4c4371f..4807a0acf1b 100644 --- a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js +++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js @@ -85,6 +85,85 @@ class YoutubeTranscript { .replace(/\s+/g, " "); } + /** + * Calculates a preference score for a caption track to determine the best match + * @param {Object} track - The caption track object from YouTube + * @param {string} track.languageCode - ISO language code (e.g., 'zh-HK', 'en', 'es') + * @param {string} track.kind - Track type ('asr' for auto-generated, "" for human-transcribed) + * @param {string[]} preferredLanguages - Array of language codes in preference order (e.g., ['zh-HK', 'en']) + * @returns {number} Preference score (lower is better) + */ + static #calculatePreferenceScore(track, preferredLanguages) { + // Language preference: index in preferredLanguages array (0 = most preferred) + const languagePreference = preferredLanguages.indexOf(track.languageCode); + const languageScore = languagePreference === -1 ? 9999 : languagePreference; + + // Kind bonus: prefer human-transcribed (undefined) over auto-generated ('asr') + const kindBonus = track.kind === "asr" ? 0.5 : 0; + + return languageScore + kindBonus; + } + + /** + * Finds the most suitable caption track based on preferred languages + * @param {string} videoBody - The raw HTML response from YouTube + * @param {string[]} preferredLanguages - Array of language codes in preference order + * @returns {Object|null} The selected caption track or null if none found + */ + static #findPreferredCaptionTrack(videoBody, preferredLanguages) { + const captionsConfigJson = videoBody.match( + /"captions":(.*?),"videoDetails":/s + ); + + const captionsConfig = captionsConfigJson?.[1] + ? JSON.parse(captionsConfigJson[1]) + : null; + + const captionTracks = captionsConfig + ? captionsConfig.playerCaptionsTracklistRenderer.captionTracks + : null; + + if (!captionTracks || captionTracks.length === 0) { + return null; + } + + const sortedTracks = [...captionTracks].sort((a, b) => { + const scoreA = this.#calculatePreferenceScore(a, preferredLanguages); + const scoreB = this.#calculatePreferenceScore(b, preferredLanguages); + return scoreA - scoreB; + }); + + return sortedTracks[0]; + } + + /** + * Fetches video page content and finds the preferred caption track + * @param {string} videoId - YouTube video ID + * @param {string[]} preferredLanguages - Array of preferred language codes + * @returns {Promise} The preferred caption track + * @throws {YoutubeTranscriptError} If no suitable caption track is found + */ + static async #getPreferredCaptionTrack(videoId, preferredLanguages) { + const videoResponse = await fetch( + `https://www.youtube.com/watch?v=${videoId}`, + { credentials: "omit" } + ); + const videoBody = await videoResponse.text(); + + const preferredCaptionTrack = this.#findPreferredCaptionTrack( + videoBody, + preferredLanguages + ); + + if (!preferredCaptionTrack) { + throw new YoutubeTranscriptError( + "No suitable caption track found for the video" + ); + } + + return preferredCaptionTrack; + } + /** * Fetch transcript from YouTube video * @param {string} videoId - Video URL or video identifier @@ -93,14 +172,20 @@ class YoutubeTranscript { * @returns {Promise} Video transcript text */ static async fetchTranscript(videoId, config = {}) { + const preferredLanguages = config?.lang ? [config?.lang, "en"] : ["en"]; const identifier = this.retrieveVideoId(videoId); - const lang = config?.lang ?? "en"; try { + const preferredCaptionTrack = await this.#getPreferredCaptionTrack( + identifier, + preferredLanguages + ); + const innerProto = this.#getBase64Protobuf({ - param1: "asr", - param2: lang, + param1: preferredCaptionTrack.kind || "", + param2: preferredCaptionTrack.languageCode, }); + const params = this.#getBase64Protobuf({ param1: identifier, param2: innerProto,