From 537c7728d001e0db27fb0137fcdea3d0dc220110 Mon Sep 17 00:00:00 2001 From: kentfitch Date: Wed, 11 Sep 2024 15:37:17 +1000 Subject: [PATCH] more changes to filter copyright, add openai api to get descriptions (you'll need a key) --- .gitignore | 2 + web/package-lock.json | 161 +++++++++++++++++++++++++++++++++++++++--- web/package.json | 1 + web/routes/admin.js | 30 ++++++-- 4 files changed, 182 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index cb79c7c..2d15c34 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,5 @@ copyright-check-Francis-set-suppressed-ignored.csv web/package-lock.json web/package.json +web/package-lock.json +web/package.json diff --git a/web/package-lock.json b/web/package-lock.json index e5ebb1c..846a12f 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -1,11 +1,11 @@ { - "name": "policydocs", + "name": "pictures semantic demo", "version": "1.0.0", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "policydocs", + "name": "pictures semantic demo", "version": "1.0.0", "license": "ISC", "dependencies": { @@ -23,6 +23,7 @@ "log4js": "^6.9.1", "moment": "^2.29.4", "morgan": "^1.10.0", + "openai": "^4.58.2", "pdf-text-reader": "^5.1.0", "pdf2html": "^3.1.0", "rotating-file-stream": "^3.1.1", @@ -50,12 +51,49 @@ "node-pre-gyp": "bin/node-pre-gyp" } }, + "node_modules/@types/node": { + "version": "18.19.50", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.50.tgz", + "integrity": "sha512-xonK+NRrMBRtkL1hVCc3G+uXtjh1Al4opBLjqVmipe5ZAaBYWW6cNAiBVZ1BvmkBhep698rP3UM3aRAdSALuhg==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/@types/node-fetch": { + "version": "2.6.11", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.11.tgz", + "integrity": "sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g==", + "license": "MIT", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.0" + } + }, + "node_modules/@types/qs": { + "version": "6.9.15", + "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.15.tgz", + "integrity": "sha512-uXHQKES6DQKKCLh441Xv/dwxOq1TVS3JPUMlEqoEglvlhR6Mxnlew/Xq/LRVHpLyk7iK3zODe1qYHIMltO7XGg==", + "license": "MIT" + }, "node_modules/abbrev": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz", "integrity": "sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==", "optional": true }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "license": "MIT", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, "node_modules/accepts": { "version": "1.3.8", "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz", @@ -103,6 +141,18 @@ "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==", "optional": true }, + "node_modules/agentkeepalive": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz", + "integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==", + "license": "MIT", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + "engines": { + "node": ">= 8.0.0" + } + }, "node_modules/ansi-regex": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", @@ -705,6 +755,15 @@ "node": ">= 0.6" } }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/express": { "version": "4.19.2", "resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz", @@ -856,6 +915,25 @@ "node": ">= 6" } }, + "node_modules/form-data-encoder": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", + "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==", + "license": "MIT" + }, + "node_modules/formdata-node": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", + "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", + "license": "MIT", + "dependencies": { + "node-domexception": "1.0.0", + "web-streams-polyfill": "4.0.0-beta.3" + }, + "engines": { + "node": ">= 12.20" + } + }, "node_modules/formidable": { "version": "3.5.1", "resolved": "https://registry.npmjs.org/formidable/-/formidable-3.5.1.tgz", @@ -1178,6 +1256,15 @@ "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==", "optional": true }, + "node_modules/humanize-ms": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", + "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", + "license": "MIT", + "dependencies": { + "ms": "^2.0.0" + } + }, "node_modules/iconv-lite": { "version": "0.4.24", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", @@ -1490,11 +1577,29 @@ "node": ">= 0.6" } }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "engines": { + "node": ">=10.5.0" + } + }, "node_modules/node-fetch": { "version": "2.7.0", "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", - "optional": true, "dependencies": { "whatwg-url": "^5.0.0" }, @@ -1593,6 +1698,34 @@ "wrappy": "1" } }, + "node_modules/openai": { + "version": "4.58.2", + "resolved": "https://registry.npmjs.org/openai/-/openai-4.58.2.tgz", + "integrity": "sha512-hIalypYELt7/PcryFpz4Gi1z/8ZDzukWyOhr+jKM6L/GVE+t4NseaENXKt+OxnkkIm/1R2EkdGxgnHrZ0kB5bQ==", + "license": "Apache-2.0", + "dependencies": { + "@types/node": "^18.11.18", + "@types/node-fetch": "^2.6.4", + "@types/qs": "^6.9.15", + "abort-controller": "^3.0.0", + "agentkeepalive": "^4.2.1", + "form-data-encoder": "1.7.2", + "formdata-node": "^4.3.2", + "node-fetch": "^2.6.7", + "qs": "^6.10.3" + }, + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, "node_modules/parse5": { "version": "7.1.2", "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz", @@ -2139,8 +2272,7 @@ "node_modules/tr46": { "version": "0.0.3", "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", - "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", - "optional": true + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" }, "node_modules/type-is": { "version": "1.6.18", @@ -2154,6 +2286,12 @@ "node": ">= 0.6" } }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, "node_modules/universalify": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz", @@ -2197,17 +2335,24 @@ "node": ">= 0.8" } }, + "node_modules/web-streams-polyfill": { + "version": "4.0.0-beta.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", + "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, "node_modules/webidl-conversions": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", - "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", - "optional": true + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" }, "node_modules/whatwg-url": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", - "optional": true, "dependencies": { "tr46": "~0.0.3", "webidl-conversions": "^3.0.0" diff --git a/web/package.json b/web/package.json index 7b61e17..06887f5 100644 --- a/web/package.json +++ b/web/package.json @@ -23,6 +23,7 @@ "log4js": "^6.9.1", "moment": "^2.29.4", "morgan": "^1.10.0", + "openai": "^4.58.2", "pdf-text-reader": "^5.1.0", "pdf2html": "^3.1.0", "rotating-file-stream": "^3.1.1", diff --git a/web/routes/admin.js b/web/routes/admin.js index 22b8602..ef827c3 100644 --- a/web/routes/admin.js +++ b/web/routes/admin.js @@ -109,6 +109,7 @@ async function addOpenAIDescriptions(req, res) { let count = 0 ; let alreadyGotDesc = 0 ; let descAdded = 0 ; + let errors = 0 ; let solrRes = await axios.get(appConfig.solr.getSolrBaseUrl() + "pictures/select" + "?wt=json&rows=9999&fl=id,url,title,bibId,processingStatus,openAIDescription,copyright" + @@ -139,6 +140,8 @@ async function addOpenAIDescriptions(req, res) { let instructions = "Please describe this image." ; if (doc.title) instructions += " For reference, this is the title of the image: " + doc.title.replace("[picture]", "") ; + let openAIDescription = null ; + try { const completion = await openai.chat.completions.create({ model:"gpt-4o", messages:[ @@ -152,10 +155,28 @@ async function addOpenAIDescriptions(req, res) { console.log("COMPLETION " + JSON.stringify(completion)) ; - let openAIDescription = completion.choices[0].message.content ; + openAIDescription = completion.choices[0].message.content ; + } + catch (oe) { + + res.write("openAIDescription Error " + oe + "\n") ; + console.log("openAIDescription Error " + oe) ; + console.log(oe.stack) ; + } res.write("doc " + doc.id + " openAIDescription: " + openAIDescription + "\n") ; + + if (!openAIDescription) { // error... + let updatedFields = { + processingStatus: "error getting openAIdescr" + } ; + if (!doc.copyright) // fix bug + updatedFields.copyright = "Out of Copyright" ; + await updateDoc(doc.id, updatedFields) ; + errors++ ; + continue ; + } let openaiDescriptionVector = await util.getEmbedding(openAIDescription) ; // console.log("got embedding") ; @@ -172,8 +193,8 @@ async function addOpenAIDescriptions(req, res) { } - res.write("\n done count " + count + " alreadyGotDesc " + alreadyGotDesc + " descAdded " + descAdded) ; - console.log("\n done count " + count + " alreadyGotDesc " + alreadyGotDesc + " descAdded " + descAdded) ; + res.write("\n done count " + count + " alreadyGotDesc " + alreadyGotDesc + " descAdded " + descAdded + " errors " + errors) ; + console.log("\n done count " + count + " alreadyGotDesc " + alreadyGotDesc + " descAdded " + descAdded + " errors " + errors) ; } @@ -372,7 +393,8 @@ async function generatePhi35DescriptionForOAimageDescriptions(req, res) { // REAL WAS : "?wt=json&rows=999999&fl=id,url,title,suppressed,manuallyForcedUnsuppressed&sort=id asc&q=id: {\"" + lastId + "\" TO \"z\"]") ; //fix up first run - those without v12 were not given title to help! - "?wt=json&rows=999999&fl=id,url,title,suppressed,manuallyForcedUnsuppressed&sort=id asc&q=openAIDescription:* AND msVision35Description:*") ; +// another run - gen for those with no vision35 yet 11sep24 + "?wt=json&rows=999999&fl=id,url,title,suppressed,manuallyForcedUnsuppressed&sort=id asc&q=openAIDescription:* AND -msVision35Description:*") ; if (!((solrRes.status == 200) && solrRes.data && solrRes.data.response)) { res.write(" Failed to find any records, status: " + solrRes.status + "\n") ; if (solrRes.data) res.write(" Solr data: " + JSON.stringify(solrRes.data) + "\n") ;