这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@
"Milvus",
"Mintplex",
"moderations",
"numpages",
"Ollama",
"Oobabooga",
"openai",
"opendocument",
"openrouter",
"pagerender",
"Qdrant",
"searxng",
"Serper",
Expand Down
2 changes: 1 addition & 1 deletion collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"node-html-parser": "^6.1.13",
"officeparser": "^4.0.5",
"openai": "4.38.5",
"pdfjs-dist": "3.4.120",
"pdf-parse": "^1.1.1",
"puppeteer": "~21.5.2",
"slugify": "^1.6.6",
"url-pattern": "^1.0.3",
Expand Down
62 changes: 62 additions & 0 deletions collector/processSingleFile/convert/asPDF/PDFLoader/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
const fs = require("fs").promises;
const pdf = require("pdf-parse");

/**
 * Reads a PDF file from disk, parses it with `pdf-parse`, and returns its
 * text as an array of `{ pageContent, metadata }` documents.
 *
 * With `splitPages` enabled (the default) one document is returned per page
 * collected by `renderPage`; otherwise a single document holding the whole
 * text returned by the parser is produced.
 */
class PDFLoader {
  /**
   * Text of every page collected by `renderPage` during the most recent
   * `load()` call, in the order the pages were rendered.
   * @type {string[]}
   */
  pages = [];

  /**
   * @param {string} filePath - Path of the PDF file to read.
   * @param {{ splitPages?: boolean }} [options]
   * @param {boolean} [options.splitPages=true] - Return one document per page
   *   instead of one document for the whole file.
   */
  constructor(filePath, { splitPages = true } = {}) {
    this.filePath = filePath;
    this.splitPages = splitPages;
  }

  /**
   * Parses the file and builds the document list.
   *
   * @returns {Promise<Array<{ pageContent: string, metadata: object }>>}
   *   Documents whose metadata carries the source path and the PDF details
   *   reported by the parser; per-page documents also carry
   *   `loc.pageNumber` (1-based).
   */
  async load() {
    // Start from an empty collector: `renderPage` only ever appends, so
    // without this a second load() on the same instance would return the
    // previous run's pages followed by the new ones.
    this.pages = [];

    const buffer = await fs.readFile(this.filePath);

    const options = {
      pagerender: this.splitPages ? this.renderPage : null,
    };

    const { text, numpages, info, metadata, version } = await pdf(
      buffer,
      options
    );

    if (!this.splitPages) {
      return [
        {
          pageContent: text.trim(),
          metadata: {
            source: this.filePath,
            pdf: { version, info, metadata, totalPages: numpages },
          },
        },
      ];
    }

    return this.pages.map((pageContent, index) => ({
      pageContent: pageContent.trim(),
      metadata: {
        source: this.filePath,
        pdf: { version, info, metadata, totalPages: numpages },
        loc: { pageNumber: index + 1 },
      },
    }));
  }

  /**
   * Page render callback handed to the parser as `pagerender`. Concatenates
   * the page's text items, records the result in `this.pages`, and returns it.
   *
   * Defined as an arrow-function field so `this` stays bound to the loader
   * when the function is passed around as a bare callback.
   *
   * @param {{ getTextContent: () => Promise<{ items: Array<{ str: string, transform: number[] }> }> }} pageData
   * @returns {Promise<string>} The text extracted from the page.
   */
  renderPage = async (pageData) => {
    const textContent = await pageData.getTextContent();
    let lastY;
    let text = "";
    for (const item of textContent.items) {
      // A change in transform[5] between consecutive items starts a new line
      // (presumably the item's vertical position — confirm against PDF.js).
      if (lastY !== item.transform[5] && lastY !== undefined) {
        text += "\n";
      }
      text += item.str;
      lastY = item.transform[5];
    }
    this.pages.push(text);
    return text;
  };
}

module.exports = PDFLoader;
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,28 @@ const {
createdDate,
trashFile,
writeToServerDocuments,
} = require("../../utils/files");
const { tokenizeString } = require("../../utils/tokenizer");
} = require("../../../utils/files");
const { tokenizeString } = require("../../../utils/tokenizer");
const { default: slugify } = require("slugify");
const PDFLoader = require("./PDFLoader");

async function asPDF({ fullFilePath = "", filename = "" }) {
const pdfjsLib = await import("pdfjs-dist");
console.log(`-- Working ${filename} --`);

const loadingTask = pdfjsLib.default.getDocument(fullFilePath);
const pdf = await loadingTask.promise;
async function asPdf({ fullFilePath = "", filename = "" }) {
const pdfLoader = new PDFLoader(fullFilePath, {
splitPages: true,
});

const numPages = pdf.numPages;
console.log(`-- Working ${filename} --`);
const pageContent = [];
const docs = await pdfLoader.load();

for (let i = 1; i <= numPages; i++) {
console.log(`-- Parsing content from pg ${i} --`);
const page = await pdf.getPage(i);
const content = await page.getTextContent();
const text = content.items.map((item) => item.str).join(" ");

if (text.length) {
pageContent.push(text);
}
for (const doc of docs) {
console.log(
`-- Parsing content from pg ${
doc.metadata?.loc?.pageNumber || "unknown"
} --`
);
if (!doc.pageContent || !doc.pageContent.length) continue;
pageContent.push(doc.pageContent);
}

if (!pageContent.length) {
Expand All @@ -38,15 +37,13 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
};
}

const content = pageContent.join(" ");
const metadata = await pdf.getMetadata();

const content = pageContent.join("");
const data = {
id: v4(),
url: "file://" + fullFilePath,
title: filename,
docAuthor: metadata?.info?.Creator || "no author found",
description: metadata?.info?.Title || "No description found.",
docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found",
description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.",
docSource: "pdf file uploaded by the user.",
chunkSource: "",
published: createdDate(fullFilePath),
Expand All @@ -64,4 +61,4 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
return { success: true, reason: null, documents: [document] };
}

module.exports = asPDF;
module.exports = asPdf;
2 changes: 1 addition & 1 deletion collector/utils/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
".rst": "./convert/asTxt.js",

".html": "./convert/asTxt.js",
".pdf": "./convert/asPDF.js",
".pdf": "./convert/asPDF/index.js",

".docx": "./convert/asDocx.js",
".pptx": "./convert/asOfficeMime.js",
Expand Down
59 changes: 1 addition & 58 deletions collector/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
"@langchain/core" "~0.1"
js-tiktoken "^1.0.11"

"@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.11":
"@mapbox/node-pre-gyp@^1.0.11":
version "1.0.11"
resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa"
integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==
Expand Down Expand Up @@ -662,15 +662,6 @@ camelcase@6:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==

canvas@^2.11.0:
version "2.11.2"
resolved "https://registry.yarnpkg.com/canvas/-/canvas-2.11.2.tgz#553d87b1e0228c7ac0fc72887c3adbac4abbd860"
integrity sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==
dependencies:
"@mapbox/node-pre-gyp" "^1.0.0"
nan "^2.17.0"
simple-get "^3.0.3"

chalk@^2.4.2:
version "2.4.2"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
Expand Down Expand Up @@ -936,13 +927,6 @@ decamelize@1.2.0:
resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290"
integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==

decompress-response@^4.2.0:
version "4.2.1"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986"
integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==
dependencies:
mimic-response "^2.0.0"

decompress-response@^6.0.0:
version "6.0.0"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc"
Expand Down Expand Up @@ -2237,11 +2221,6 @@ mime@^3.0.0:
resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7"
integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A==

mimic-response@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.1.0.tgz#d13763d35f613d09ec37ebb30bac0469c0ee8f43"
integrity sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==

mimic-response@^3.1.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
Expand Down Expand Up @@ -2375,11 +2354,6 @@ mustache@^4.2.0:
resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64"
integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==

nan@^2.17.0:
version "2.20.0"
resolved "https://registry.yarnpkg.com/nan/-/nan-2.20.0.tgz#08c5ea813dd54ed16e5bd6505bf42af4f7838ca3"
integrity sha512-bk3gXBZDGILuuo/6sKtr0DQmSThYHLtNCdSdXk9YkxD/jK6X2vmCyyXBBxyqZ4XcnzTyYEAThfX3DCEnLf6igw==

napi-build-utils@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806"
Expand Down Expand Up @@ -2715,18 +2689,6 @@ path-type@^4.0.0:
resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b"
integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==

path2d-polyfill@^2.0.1:
version "2.1.1"
resolved "https://registry.yarnpkg.com/path2d-polyfill/-/path2d-polyfill-2.1.1.tgz#6098b7bf2fc24c306c6377bcd558b17ba437ea27"
integrity sha512-4Rka5lN+rY/p0CdD8+E+BFv51lFaFvJOrlOhyQ+zjzyQrzyh3ozmxd1vVGGDdIbUFSBtIZLSnspxTgPT0iJhvA==
dependencies:
path2d "0.1.1"

path2d@0.1.1:
version "0.1.1"
resolved "https://registry.yarnpkg.com/path2d/-/path2d-0.1.1.tgz#d3c3886cd2252fb2a7830c27ea7bb9a862d937ea"
integrity sha512-/+S03c8AGsDYKKBtRDqieTJv2GlkMb0bWjnqOgtF6MkjdUQ9a8ARAtxWf9NgKLGm2+WQr6+/tqJdU8HNGsIDoA==

pdf-parse@^1.1.1:
version "1.1.1"
resolved "https://registry.yarnpkg.com/pdf-parse/-/pdf-parse-1.1.1.tgz#745e07408679548b3995ff896fd38e96e19d14a7"
Expand All @@ -2735,16 +2697,6 @@ pdf-parse@^1.1.1:
debug "^3.1.0"
node-ensure "^0.0.0"

pdfjs-dist@3.4.120:
version "3.4.120"
resolved "https://registry.yarnpkg.com/pdfjs-dist/-/pdfjs-dist-3.4.120.tgz#6f4222117157498f179c95dc4569fad6336a8fdd"
integrity sha512-B1hw9ilLG4m/jNeFA0C2A0PZydjxslP8ylU+I4XM7Bzh/xWETo9EiBV848lh0O0hLut7T6lK1V7cpAXv5BhxWw==
dependencies:
path2d-polyfill "^2.0.1"
web-streams-polyfill "^3.2.1"
optionalDependencies:
canvas "^2.11.0"

peberminta@^0.9.0:
version "0.9.0"
resolved "https://registry.yarnpkg.com/peberminta/-/peberminta-0.9.0.tgz#8ec9bc0eb84b7d368126e71ce9033501dca2a352"
Expand Down Expand Up @@ -3175,15 +3127,6 @@ simple-concat@^1.0.0:
resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f"
integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==

simple-get@^3.0.3:
version "3.1.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-3.1.1.tgz#cc7ba77cfbe761036fbfce3d021af25fc5584d55"
integrity sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==
dependencies:
decompress-response "^4.2.0"
once "^1.3.1"
simple-concat "^1.0.0"

simple-get@^4.0.0, simple-get@^4.0.1:
version "4.0.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543"
Expand Down