这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"node-html-parser": "^6.1.13",
"officeparser": "^4.0.5",
"openai": "4.38.5",
"pdf-parse": "^1.1.1",
"pdfjs-dist": "3.4.120",
"puppeteer": "~21.5.2",
"slugify": "^1.6.6",
"url-pattern": "^1.0.3",
Expand All @@ -49,4 +49,4 @@
"nodemon": "^2.0.22",
"prettier": "^2.4.1"
}
}
}
1 change: 0 additions & 1 deletion collector/processSingleFile/convert/asDocx.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
let pageContent = [];
const docs = await loader.load();
for (const doc of docs) {
console.log(doc.metadata);
console.log(`-- Parsing content from docx page --`);
if (!doc.pageContent.length) continue;
pageContent.push(doc.pageContent);
Expand Down
38 changes: 21 additions & 17 deletions collector/processSingleFile/convert/asPDF.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
const { v4 } = require("uuid");
const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
const {
createdDate,
trashFile,
Expand All @@ -9,21 +8,24 @@ const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");

async function asPDF({ fullFilePath = "", filename = "" }) {
const pdfLoader = new PDFLoader(fullFilePath, {
splitPages: true,
});

const pdfjsLib = await import("pdfjs-dist");
console.log(`-- Working ${filename} --`);

const loadingTask = pdfjsLib.default.getDocument(fullFilePath);
const pdf = await loadingTask.promise;

const numPages = pdf.numPages;
const pageContent = [];
const docs = await pdfLoader.load();
for (const doc of docs) {
console.log(
`-- Parsing content from pg ${
doc.metadata?.loc?.pageNumber || "unknown"
} --`
);
if (!doc.pageContent || !doc.pageContent.length) continue;
pageContent.push(doc.pageContent);

for (let i = 1; i <= numPages; i++) {
console.log(`-- Parsing content from pg ${i} --`);
const page = await pdf.getPage(i);
const content = await page.getTextContent();
const text = content.items.map((item) => item.str).join(" ");

if (text.length) {
pageContent.push(text);
}
}

if (!pageContent.length) {
Expand All @@ -36,13 +38,15 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
};
}

const content = pageContent.join("");
const content = pageContent.join(" ");
const metadata = await pdf.getMetadata();

const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found",
description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.",
docAuthor: metadata?.info?.Creator || "no author found",
description: metadata?.info?.Title || "No description found.",
docSource: "pdf file uploaded by the user.",
chunkSource: "",
published: createdDate(fullFilePath),
Expand Down
59 changes: 58 additions & 1 deletion collector/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@
"@langchain/core" "~0.1"
js-tiktoken "^1.0.11"

"@mapbox/node-pre-gyp@^1.0.11":
"@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.11":
version "1.0.11"
resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa"
integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==
Expand Down Expand Up @@ -643,6 +643,15 @@ camelcase@6:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==

canvas@^2.11.0:
version "2.11.2"
resolved "https://registry.yarnpkg.com/canvas/-/canvas-2.11.2.tgz#553d87b1e0228c7ac0fc72887c3adbac4abbd860"
integrity sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==
dependencies:
"@mapbox/node-pre-gyp" "^1.0.0"
nan "^2.17.0"
simple-get "^3.0.3"

chalk@^2.4.2:
version "2.4.2"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
Expand Down Expand Up @@ -892,6 +901,13 @@ decamelize@1.2.0:
resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290"
integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==

decompress-response@^4.2.0:
version "4.2.1"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986"
integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==
dependencies:
mimic-response "^2.0.0"

decompress-response@^6.0.0:
version "6.0.0"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc"
Expand Down Expand Up @@ -2154,6 +2170,11 @@ mime@^3.0.0:
resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7"
integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A==

mimic-response@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.1.0.tgz#d13763d35f613d09ec37ebb30bac0469c0ee8f43"
integrity sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==

mimic-response@^3.1.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
Expand Down Expand Up @@ -2287,6 +2308,11 @@ mustache@^4.2.0:
resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64"
integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==

nan@^2.17.0:
version "2.20.0"
resolved "https://registry.yarnpkg.com/nan/-/nan-2.20.0.tgz#08c5ea813dd54ed16e5bd6505bf42af4f7838ca3"
integrity sha512-bk3gXBZDGILuuo/6sKtr0DQmSThYHLtNCdSdXk9YkxD/jK6X2vmCyyXBBxyqZ4XcnzTyYEAThfX3DCEnLf6igw==

napi-build-utils@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806"
Expand Down Expand Up @@ -2615,6 +2641,18 @@ path-type@^4.0.0:
resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b"
integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==

path2d-polyfill@^2.0.1:
version "2.1.1"
resolved "https://registry.yarnpkg.com/path2d-polyfill/-/path2d-polyfill-2.1.1.tgz#6098b7bf2fc24c306c6377bcd558b17ba437ea27"
integrity sha512-4Rka5lN+rY/p0CdD8+E+BFv51lFaFvJOrlOhyQ+zjzyQrzyh3ozmxd1vVGGDdIbUFSBtIZLSnspxTgPT0iJhvA==
dependencies:
path2d "0.1.1"

path2d@0.1.1:
version "0.1.1"
resolved "https://registry.yarnpkg.com/path2d/-/path2d-0.1.1.tgz#d3c3886cd2252fb2a7830c27ea7bb9a862d937ea"
integrity sha512-/+S03c8AGsDYKKBtRDqieTJv2GlkMb0bWjnqOgtF6MkjdUQ9a8ARAtxWf9NgKLGm2+WQr6+/tqJdU8HNGsIDoA==

pdf-parse@^1.1.1:
version "1.1.1"
resolved "https://registry.yarnpkg.com/pdf-parse/-/pdf-parse-1.1.1.tgz#745e07408679548b3995ff896fd38e96e19d14a7"
Expand All @@ -2623,6 +2661,16 @@ pdf-parse@^1.1.1:
debug "^3.1.0"
node-ensure "^0.0.0"

pdfjs-dist@3.4.120:
version "3.4.120"
resolved "https://registry.yarnpkg.com/pdfjs-dist/-/pdfjs-dist-3.4.120.tgz#6f4222117157498f179c95dc4569fad6336a8fdd"
integrity sha512-B1hw9ilLG4m/jNeFA0C2A0PZydjxslP8ylU+I4XM7Bzh/xWETo9EiBV848lh0O0hLut7T6lK1V7cpAXv5BhxWw==
dependencies:
path2d-polyfill "^2.0.1"
web-streams-polyfill "^3.2.1"
optionalDependencies:
canvas "^2.11.0"

peberminta@^0.9.0:
version "0.9.0"
resolved "https://registry.yarnpkg.com/peberminta/-/peberminta-0.9.0.tgz#8ec9bc0eb84b7d368126e71ce9033501dca2a352"
Expand Down Expand Up @@ -3048,6 +3096,15 @@ simple-concat@^1.0.0:
resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f"
integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==

simple-get@^3.0.3:
version "3.1.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-3.1.1.tgz#cc7ba77cfbe761036fbfce3d021af25fc5584d55"
integrity sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==
dependencies:
decompress-response "^4.2.0"
once "^1.3.1"
simple-concat "^1.0.0"

simple-get@^4.0.0, simple-get@^4.0.1:
version "4.0.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543"
Expand Down