Skip to content

Commit

Permalink
Merge pull request FlowiseAI#1687 from 0xi4o/bug/scrap-limit
Browse files Browse the repository at this point in the history
Fix: relative links method and limit not applying to manage links
  • Loading branch information
0xi4o authored Feb 12, 2024
2 parents d5d690b + 0ddae82 commit 4e8bf49
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 17 deletions.
6 changes: 4 additions & 2 deletions packages/components/nodes/documentloaders/Cheerio/Cheerio.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,9 @@ class Cheerio_DocumentLoaders implements INode {
let docs = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = 10
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
// so when limit is 0 we can fetch all the links
if (limit === null || limit === undefined) limit = 10
else if (limit < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
selectedLinks && selectedLinks.length > 0
Expand All @@ -143,7 +145,7 @@ class Cheerio_DocumentLoaders implements INode {
} else if (selectedLinks && selectedLinks.length > 0) {
if (process.env.DEBUG === 'true')
options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
for (const page of selectedLinks) {
for (const page of selectedLinks.slice(0, limit)) {
docs.push(...(await cheerioLoader(page)))
}
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,9 @@ class Playwright_DocumentLoaders implements INode {
let docs = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = 10
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
// so when limit is 0 we can fetch all the links
if (limit === null || limit === undefined) limit = 10
else if (limit < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
selectedLinks && selectedLinks.length > 0
Expand All @@ -184,7 +186,7 @@ class Playwright_DocumentLoaders implements INode {
} else if (selectedLinks && selectedLinks.length > 0) {
if (process.env.DEBUG === 'true')
options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
for (const page of selectedLinks) {
for (const page of selectedLinks.slice(0, limit)) {
docs.push(...(await playwrightLoader(page)))
}
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,9 @@ class Puppeteer_DocumentLoaders implements INode {
let docs = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = 10
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
// so when limit is 0 we can fetch all the links
if (limit === null || limit === undefined) limit = 10
else if (limit < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
selectedLinks && selectedLinks.length > 0
Expand All @@ -185,7 +187,7 @@ class Puppeteer_DocumentLoaders implements INode {
} else if (selectedLinks && selectedLinks.length > 0) {
if (process.env.DEBUG === 'true')
options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
for (const page of selectedLinks) {
for (const page of selectedLinks.slice(0, limit)) {
docs.push(...(await puppeteerLoader(page)))
}
} else {
Expand Down
8 changes: 7 additions & 1 deletion packages/server/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1149,8 +1149,14 @@ export class App {
this.app.get('/api/v1/fetch-links', async (req: Request, res: Response) => {
const url = decodeURIComponent(req.query.url as string)
const relativeLinksMethod = req.query.relativeLinksMethod as string
if (!relativeLinksMethod) {
return res.status(500).send('Please choose a Relative Links Method in Additional Parameters.')
}

const limit = parseInt(req.query.limit as string)
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
const links: string[] = relativeLinksMethod === 'webCrawl' ? await webCrawl(url, 0) : await xmlScrape(url, 0)
const links: string[] = relativeLinksMethod === 'webCrawl' ? await webCrawl(url, limit) : await xmlScrape(url, limit)
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)

res.json({ status: 'OK', links })
})
Expand Down
6 changes: 3 additions & 3 deletions packages/ui/src/api/scraper.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import client from './client'

const fetchAllLinks = (url, relativeLinksMethod) =>
client.get(`/fetch-links?url=${encodeURIComponent(url)}&relativeLinksMethod=${relativeLinksMethod}`)
const fetchLinks = (url, relativeLinksMethod, relativeLinksLimit) =>
client.get(`/fetch-links?url=${encodeURIComponent(url)}&relativeLinksMethod=${relativeLinksMethod}&limit=${relativeLinksLimit}`)

export default {
fetchAllLinks
fetchLinks
}
50 changes: 45 additions & 5 deletions packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,31 @@ import {
Stack,
Typography
} from '@mui/material'
import { IconTrash } from '@tabler/icons'
import { IconTrash, IconX } from '@tabler/icons'
import PerfectScrollbar from 'react-perfect-scrollbar'

import { BackdropLoader } from 'ui-component/loading/BackdropLoader'
import { StyledButton } from 'ui-component/button/StyledButton'

import scraperApi from 'api/scraper'

import { HIDE_CANVAS_DIALOG, SHOW_CANVAS_DIALOG } from 'store/actions'
import useNotifier from 'utils/useNotifier'

import {
HIDE_CANVAS_DIALOG,
SHOW_CANVAS_DIALOG,
enqueueSnackbar as enqueueSnackbarAction,
closeSnackbar as closeSnackbarAction
} from 'store/actions'

const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
const portalElement = document.getElementById('portal')
const dispatch = useDispatch()

useNotifier()
const enqueueSnackbar = (...args) => dispatch(enqueueSnackbarAction(...args))
const closeSnackbar = (...args) => dispatch(closeSnackbarAction(...args))

const [loading, setLoading] = useState(false)
const [selectedLinks, setSelectedLinks] = useState([])
const [url, setUrl] = useState('')
Expand All @@ -53,9 +64,38 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {

const handleFetchLinks = async () => {
setLoading(true)
const fetchLinksResp = await scraperApi.fetchAllLinks(url, 'webCrawl')
if (fetchLinksResp.data) {
setSelectedLinks(fetchLinksResp.data.links)
try {
const fetchLinksResp = await scraperApi.fetchLinks(url, dialogProps.relativeLinksMethod, dialogProps.limit)
if (fetchLinksResp.data) {
setSelectedLinks(fetchLinksResp.data.links)
enqueueSnackbar({
message: 'Successfully fetched links',
options: {
key: new Date().getTime() + Math.random(),
variant: 'success',
action: (key) => (
<Button style={{ color: 'white' }} onClick={() => closeSnackbar(key)}>
<IconX />
</Button>
)
}
})
}
} catch (error) {
const errorData = error.response.data || `${error.response.status}: ${error.response.statusText}`
enqueueSnackbar({
message: errorData,
options: {
key: new Date().getTime() + Math.random(),
variant: 'error',
persist: true,
action: (key) => (
<Button style={{ color: 'white' }} onClick={() => closeSnackbar(key)}>
<IconX />
</Button>
)
}
})
}
setLoading(false)
}
Expand Down
8 changes: 6 additions & 2 deletions packages/ui/src/views/canvas/NodeInputHandler.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,11 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA
}
}

const onManageLinksDialogClicked = (url, selectedLinks) => {
const onManageLinksDialogClicked = (url, selectedLinks, relativeLinksMethod, limit) => {
const dialogProps = {
url,
relativeLinksMethod,
limit,
selectedLinks,
confirmButtonName: 'Save',
cancelButtonName: 'Cancel'
Expand Down Expand Up @@ -475,7 +477,9 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA
onClick={() =>
onManageLinksDialogClicked(
data.inputs[inputParam.name] ?? inputParam.default ?? '',
data.inputs.selectedLinks
data.inputs.selectedLinks,
data.inputs['relativeLinksMethod'] ?? 'webCrawl',
parseInt(data.inputs['limit']) ?? 0
)
}
>
Expand Down

0 comments on commit 4e8bf49

Please sign in to comment.