Skip to content

Commit

Permalink
improve gnocchi scans
Browse files Browse the repository at this point in the history
  • Loading branch information
a-type committed May 28, 2024
1 parent 27a2772 commit c1fe94d
Show file tree
Hide file tree
Showing 21 changed files with 928 additions and 722 deletions.
11 changes: 6 additions & 5 deletions apps/gnocchi/hub/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@
}
},
"scripts": {
"build": "pnpm run build:client && pnpm run build:server",
"build": "pnpm run build:client && pnpm run build:server && pnpm run typings",
"build:client": "vite build --outDir dist/client",
"build:server": "vite build --ssr src/entry-server.tsx --outDir dist/server && tsc --declaration --emitDeclarationOnly",
"dev": "pnpm run dev:client && pnpm run dev:server && tsc --declaration --emitDeclarationOnly",
"dev:client": "vite build --mode development --outDir dist/client",
"dev:server": "vite build --mode development --ssr src/entry-server.tsx --outDir dist/server",
"build:server": "vite build --ssr src/entry-server.tsx --outDir dist/server",
"dev": "concurrently npm:dev:client npm:dev:server npm:typings",
"dev:client": "vite build --mode development --outDir dist/client --watch",
"dev:server": "vite build --mode development --ssr src/entry-server.tsx --outDir dist/server --watch",
"typings": "tsc --declaration --emitDeclarationOnly",
"typecheck": "tsc --build tsconfig.json"
},
"dependencies": {
Expand Down
28 changes: 28 additions & 0 deletions apps/gnocchi/hub/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
import { Note } from '@a-type/ui/components/note';
import { H1, H2, P } from '@a-type/ui/components/typography';
import { HubRecipeData } from './types.js';
import { Chip } from '@a-type/ui/components/chip';

export function App({
recipe: data,
Expand Down Expand Up @@ -54,6 +55,33 @@ export function App({
<P itemProp="author" className="p-author">
Published by {data.publisher?.fullName ?? 'Anonymous'}
</P>
<div className="row flex-wrap">
{data.servings && <Chip>Serves {data.servings}</Chip>}
{data.prepTimeMinutes && (
<>
<Chip>Prep {data.prepTimeMinutes} min</Chip>
<span className="hidden" itemProp="prepTime">
P0Y0M0DT0H{data.prepTimeMinutes}M0S
</span>
</>
)}
{data.cookTimeMinutes && (
<>
<Chip>Cook {data.cookTimeMinutes} min</Chip>
<span className="hidden" itemProp="cookTime">
P0Y0M0DT0H{data.cookTimeMinutes}M0S
</span>
</>
)}
{data.totalTimeMinutes && (
<>
<Chip>Total {data.totalTimeMinutes} min</Chip>
<span className="hidden" itemProp="totalTime">
P0Y0M0DT0H{data.totalTimeMinutes}M0S
</span>
</>
)}
</div>
</TopLineTitle>
</TopLineRoot>
{data.note && <Note className="self-start">{data.note}</Note>}
Expand Down
6 changes: 4 additions & 2 deletions apps/gnocchi/hub/src/components/Instructions.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,9 @@ const SectionTitle = Node.create({
renderHTML({ HTMLAttributes }: any) {
return [
'h2',
mergeAttributes(this.options.HTMLAttributes, HTMLAttributes),
mergeAttributes(this.options.HTMLAttributes, HTMLAttributes, {
'data-section-title': true,
}),
0,
];
},
Expand All @@ -156,7 +158,7 @@ function InstructionStepView({
<NodeViewContent />
</div>
{node.attrs.note && (
<Note className="mt-2 ml-8 max-w-80% w-max-content">
<Note className="mt-2 ml-8 max-w-80% w-max-content" data-note="true">
{node.attrs.note}
</Note>
)}
Expand Down
56 changes: 53 additions & 3 deletions apps/gnocchi/hub/src/types.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,62 @@
/**
 * Shape of the recipe data used by the hub app. `App.tsx` imports this type
 * and reads `publisher`, `servings`, `prepTimeMinutes`, `cookTimeMinutes`,
 * `totalTimeMinutes` and `note` from its `recipe` prop (presumably typed as
 * this interface — TODO confirm against the full `App` signature).
 *
 * NOTE(review): this listing is a rendered commit diff, so `prelude`,
 * `ingredients` and `instructions` each appear twice: once with the previous
 * `any`-based type and once with the structured replacement.
 */
export interface HubRecipeData {
id: string;
title: string;
prelude: any;
// Document whose content is a list of paragraph nodes.
prelude: {
type: 'doc';
content: ParagraphNode[];
};
mainImageUrl: string;
ingredients: any[];
instructions: any[];
ingredients: GnocchiIngredient[];
// tiptap/prosemirror content
instructions: {
type: 'doc';
content: (StepNode | SectionTitleNode)[];
};
// NOTE(review): declared as required, but App.tsx reads it as
// `data.publisher?.fullName ?? 'Anonymous'` — confirm whether it can be absent.
publisher: {
fullName: string;
};
note?: string;
// The optional values below are rendered by App.tsx as "Serves N",
// "Prep N min", "Cook N min" and "Total N min" chips when truthy.
servings?: number;
prepTimeMinutes?: number;
cookTimeMinutes?: number;
totalTimeMinutes?: number;
}

/**
 * Element type of `HubRecipeData.ingredients`.
 *
 * NOTE(review): the meaning of individual fields is not established by this
 * file; the hints below are inferred from names only and should be confirmed
 * against whatever produces this data.
 */
type GnocchiIngredient = {
// presumably the full ingredient line as written — TODO confirm
text: string;
comments: string[];
quantity: number;
// optional: an ingredient may have no unit
unit?: string;
isSectionHeader: boolean;
food: string;
id: string;
// explicitly nullable rather than optional
note: string | null;
};

/**
 * Node tagged `type: 'step'`; one of the two node kinds allowed in
 * `HubRecipeData.instructions.content`. Its content is a list of text nodes,
 * and its attrs carry a required `id` plus an optional `note`.
 */
type StepNode = {
type: 'step';
content: TextNode[];
attrs: {
id: string;
note?: string;
};
};

/**
 * Node tagged `type: 'sectionTitle'`; the other node kind allowed in
 * `HubRecipeData.instructions.content`. Apart from the `type` tag it has the
 * same shape as `StepNode`.
 */
type SectionTitleNode = {
type: 'sectionTitle';
content: TextNode[];
attrs: {
id: string;
note?: string;
};
};

/**
 * Leaf node tagged `type: 'text'` that carries a `text` string. Used as the
 * content element of `StepNode`, `SectionTitleNode` and `ParagraphNode`.
 */
type TextNode = {
type: 'text';
text: string;
};

/**
 * Node tagged `type: 'paragraph'` whose content is a list of text nodes.
 * Element type of `HubRecipeData.prelude.content`.
 */
type ParagraphNode = {
type: 'paragraph';
content: TextNode[];
};
46 changes: 24 additions & 22 deletions apps/gnocchi/scanning/src/extractor.ts
Original file line number Diff line number Diff line change
@@ -1,40 +1,42 @@
import { Cheerio, CheerioAPI } from 'cheerio';

/**
 * Resolves to `U` for a `Promise<U>`. Used at the bottom of this module to
 * derive `ScanResult` from the return type of `extract`.
 *
 * NOTE(review): the alias is listed twice here because this is a rendered
 * diff showing the old and new formatting of the same declaration.
 */
type UnwrapPromise<T extends Promise<any>> = T extends Promise<infer U>
? U
: never;
type UnwrapPromise<T extends Promise<any>> =
T extends Promise<infer U> ? U : never;

import * as extractors from './extractors/index.js';
import { ExtractorData } from './extractors/types.js';

/**
 * Signature shared by every extractor: takes the loaded page and resolves to
 * the extracted data, or null when the extractor found nothing it can use.
 */
type Extractor = ($: CheerioAPI) => Promise<ExtractorData | null>;

/**
 * Extractors paired with the URL pattern that enables them. `tryParse` walks
 * this list from top to bottom and returns the first truthy result, so list
 * order is priority order.
 *
 * `/.*​/`-style entries apply to every URL; the `gnocchi` extractor is only
 * tried for URLs matching `gnocchi.biscuits.club` or `localhost:6124`.
 *
 * NOTE(review): this listing is a rendered commit diff, so the five generic
 * entries appear twice (before and after the change). Confirm the intended
 * final order against the committed file.
 */
const extractorOrdering: [RegExp, Extractor][] = [
[/.*/, extractors.microdata],
[/.*/, extractors.schemaOrg],
[/.*/, extractors.wprm],
[/.*/, extractors.tasty],
[/.*/, extractors.naive],
[/gnocchi\.biscuits\.club/, extractors.gnocchi],
[/localhost:6124/, extractors.gnocchi],
[/.*/, extractors.microdata],
[/.*/, extractors.schemaOrg],
[/.*/, extractors.wprm],
[/.*/, extractors.tasty],
[/.*/, extractors.naive],
];

/**
 * Runs the extractors from `extractorOrdering` in list order, skipping any
 * whose pattern does not match `pageUrl`, and returns the first truthy
 * result. Extractors are awaited one at a time, so later ones only run when
 * earlier ones produced nothing. Resolves to undefined when no extractor
 * yields a result.
 *
 * NOTE(review): the loop is listed twice because this is a rendered diff
 * showing the same lines before and after re-indentation.
 */
async function tryParse($: CheerioAPI, pageUrl: string) {
for (const [filter, extractor] of extractorOrdering) {
if (!filter.test(pageUrl)) {
continue;
}
const result = await extractor($);
if (result) {
return result;
}
}
for (const [filter, extractor] of extractorOrdering) {
// Extractor is not enabled for this URL.
if (!filter.test(pageUrl)) {
continue;
}
const result = await extractor($);
// First extractor to return data wins.
if (result) {
return result;
}
}
}

/**
 * Scans a loaded page for recipe data via `tryParse` and returns the result
 * with a `url` field always set: the result's own `url` when truthy,
 * otherwise `pageUrl`. When no extractor matched, the returned object
 * contains only the defaults written here.
 *
 * NOTE(review): the body is listed twice because this is a rendered diff;
 * the `scanner: 'none'` default exists only in the second (post-change)
 * copy, where it is overridden by any `scanner` value in the spread result.
 */
export async function extract($: CheerioAPI, pageUrl: string) {
const result = await tryParse($, pageUrl);
return {
...result,
url: result?.url || pageUrl,
};
const result = await tryParse($, pageUrl);
return {
// Default; replaced by the extractor's own `scanner` when a result exists.
scanner: 'none',
...result,
url: result?.url || pageUrl,
};
}

/** The resolved value type of `extract`, derived from its inferred return type. */
export type ScanResult = UnwrapPromise<ReturnType<typeof extract>>;
49 changes: 49 additions & 0 deletions apps/gnocchi/scanning/src/extractors/gnocchi.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import { CheerioAPI } from 'cheerio';
import { ExtractorData } from './types.js';

/**
 * Matches an inline `window.__SNAPSHOT__ = <payload>;` assignment and captures
 * the payload. `.` does not cross newlines, so the payload must sit on a
 * single line; the greedy capture runs to the last `;` on that line.
 */
const SNAPSHOT_MATCH = /window\.__SNAPSHOT__\s+=\s+(.*);/;

/**
 * Extractor that reads recipe data from a JSON snapshot embedded in the page
 * as `window.__SNAPSHOT__ = {...};` inside a `<script>` tag.
 *
 * Returns null, so that the next extractor can be tried, when:
 *  - no script contains a snapshot assignment,
 *  - the captured payload is not valid JSON, or
 *  - the payload is not a JSON object.
 *
 * Missing optional parts of the snapshot (publisher, ingredients,
 * instructions, node content or attrs) are tolerated rather than throwing.
 *
 * @param $ The loaded page.
 * @returns The extracted recipe data, or null if the page has no usable snapshot.
 */
export async function gnocchi($: CheerioAPI): Promise<ExtractorData | null> {
  // Inspect each script on its own: concatenating the text of several
  // scripts would let the greedy capture run across script boundaries.
  let snapshot: string | undefined;
  for (const el of $('script').toArray()) {
    const captured = SNAPSHOT_MATCH.exec($(el).text())?.[1];
    if (captured) {
      snapshot = captured;
      break;
    }
  }
  if (!snapshot) {
    return null;
  }

  // The payload comes from an arbitrary web page, so a parse failure is an
  // expected outcome and must not abort the whole scan.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped external JSON
  let data: any;
  try {
    data = JSON.parse(snapshot);
  } catch {
    return null;
  }
  if (data === null || typeof data !== 'object') {
    return null;
  }

  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped external JSON
  const ingredients: any[] = Array.isArray(data.ingredients)
    ? data.ingredients
    : [];
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped external JSON
  const instructionNodes: any[] = Array.isArray(data.instructions?.content)
    ? data.instructions.content
    : [];

  return {
    scanner: 'gnocchi',
    title: data.title,
    image: data.mainImageUrl,
    author: data.publisher?.fullName,
    detailedIngredients: ingredients.map((i: any) => ({
      original: i.text,
      quantity: i.quantity,
      unit: i.unit,
      foodName: i.food,
      note: i.note,
      comments: i.comments,
    })),
    detailedSteps: instructionNodes.map((i: any) => ({
      type: i.type === 'sectionTitle' ? 'sectionTitle' : 'step',
      // Nodes may have no `content`, and content nodes may have no `text`;
      // treat both as empty rather than throwing or appending "undefined".
      content: (Array.isArray(i.content) ? i.content : []).reduce(
        (acc: string, j: any) => acc + (j?.text ?? ''),
        '',
      ),
      note: i.attrs?.note,
    })),
    servings: data.servings,
    prepTimeMinutes: data.prepTimeMinutes,
    cookTimeMinutes: data.cookTimeMinutes,
    totalTimeMinutes: data.totalTimeMinutes,
    note: data.note,
  };
}
1 change: 1 addition & 0 deletions apps/gnocchi/scanning/src/extractors/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ export * from './naive.js';
export * from './wprm.js';
export * from './schemaOrg.js';
export * from './tasty.js';
export * from './gnocchi.js';
121 changes: 62 additions & 59 deletions apps/gnocchi/scanning/src/extractors/microdata.ts
Original file line number Diff line number Diff line change
@@ -1,68 +1,71 @@
import { Cheerio, CheerioAPI, Element } from 'cheerio';
import {
detailedInstructionsToSimple,
extractNumber,
isoToMinutes,
parseInstructionInternalText,
detailedInstructionsToSimple,
extractNumber,
isoToMinutes,
parseInstructionInternalText,
} from './utils.js';
import { ExtractorData } from './types.js';

/**
 * Extracts recipe data from schema.org Recipe microdata markup.
 *
 * Looks for elements carrying `itemscope` with an `itemtype` of
 * `http://schema.org/Recipe` (falling back to the `https://` form) and reads
 * `itemprop` descendants of the first match. Returns null when the page has
 * no such element.
 *
 * NOTE(review): this listing is a rendered commit diff, so the body appears
 * twice (before and after re-indentation). The `note` lookup and the
 * `scanner` / `note` result fields exist only in the second copy.
 */
export async function microdata($: CheerioAPI): Promise<ExtractorData | null> {
let elems = $('[itemscope][itemtype="http://schema.org/Recipe"]');
if (elems.length === 0) {
elems = $('[itemscope][itemtype="https://schema.org/Recipe"]');
}
if (elems.length === 0) {
return null;
}
let elems = $('[itemscope][itemtype="http://schema.org/Recipe"]');
// Some pages declare the itemtype with the https scheme instead.
if (elems.length === 0) {
elems = $('[itemscope][itemtype="https://schema.org/Recipe"]');
}
if (elems.length === 0) {
return null;
}

const first = $(elems.get(0)!);
const name = first.find(' > [itemprop="name"]').text().trim();
const author = first.find('[itemprop="author"]').text().trim();
const copyrightHolder = first
.find('[itemprop="copyrightHolder"]')
.text()
.trim();
const copyrightYear = first.find('[itemprop="copyrightYear"]').text().trim();
const description = first.find('[itemprop="description"]').text().trim();
const image = first.find('[itemprop="image"]').attr('src');
const datePublished = first.find('[itemprop="datePublished"]').text().trim();
const cookTime = first.find('[itemprop="cookTime"]').text().trim();
const prepTime = first.find('[itemprop="prepTime"]').text().trim();
const totalTime = first.find('[itemprop="totalTime"]').text().trim();
const cookingMethod = first.find('[itemprop="cookingMethod"]').text().trim();
const recipeCategory = first
.find('[itemprop="recipeCategory"]')
.text()
.trim();
const recipeCuisine = first.find('[itemprop="recipeCuisine"]').text().trim();
const recipeYield = first.find('[itemprop="recipeYield"]').text().trim();
const recipeIngredient = first
.find('[itemprop="recipeIngredient"]')
.map((i, e) => $(e).text().trim())
.get();
const recipeInstructionElements = first
.find('[itemprop="recipeInstructions"]')
.get();
const recipeInstructionsDetailed = recipeInstructionElements
.map((e) => {
return parseInstructionInternalText($(e));
})
.flat();
// Only the first matching recipe element is used.
const first = $(elems.get(0)!);
// `name` must be a direct child of the recipe element (` > ` selector);
// every other property below is matched at any depth.
const name = first.find(' > [itemprop="name"]').text().trim();
const author = first.find('[itemprop="author"]').text().trim();
const copyrightHolder = first
.find('[itemprop="copyrightHolder"]')
.text()
.trim();
const copyrightYear = first.find('[itemprop="copyrightYear"]').text().trim();
const description = first.find('[itemprop="description"]').text().trim();
// Reads the `src` attribute only; undefined when the element has none.
const image = first.find('[itemprop="image"]').attr('src');
const datePublished = first.find('[itemprop="datePublished"]').text().trim();
const cookTime = first.find('[itemprop="cookTime"]').text().trim();
const prepTime = first.find('[itemprop="prepTime"]').text().trim();
const totalTime = first.find('[itemprop="totalTime"]').text().trim();
const cookingMethod = first.find('[itemprop="cookingMethod"]').text().trim();
const recipeCategory = first
.find('[itemprop="recipeCategory"]')
.text()
.trim();
const recipeCuisine = first.find('[itemprop="recipeCuisine"]').text().trim();
const recipeYield = first.find('[itemprop="recipeYield"]').text().trim();
// One trimmed string per ingredient element.
const recipeIngredient = first
.find('[itemprop="recipeIngredient"]')
.map((i, e) => $(e).text().trim())
.get();
const recipeInstructionElements = first
.find('[itemprop="recipeInstructions"]')
.get();
// Per-element results are flattened into a single list
// (presumably the helper returns an array per element — TODO confirm).
const recipeInstructionsDetailed = recipeInstructionElements
.map((e) => {
return parseInstructionInternalText($(e));
})
.flat();
const note = first.find('[itemprop="note"]').text().trim();

return {
title: name,
description,
image,
copyrightHolder,
copyrightYear,
author,
cookTimeMinutes: isoToMinutes(cookTime),
prepTimeMinutes: isoToMinutes(prepTime),
totalTimeMinutes: isoToMinutes(totalTime),
rawIngredients: recipeIngredient,
steps: detailedInstructionsToSimple(recipeInstructionsDetailed),
detailedSteps: recipeInstructionsDetailed,
servings: extractNumber(recipeYield),
};
// NOTE(review): datePublished, cookingMethod, recipeCategory and
// recipeCuisine are read above but not included in the returned object.
return {
scanner: 'microdata',
title: name,
description,
image,
copyrightHolder,
copyrightYear,
author,
cookTimeMinutes: isoToMinutes(cookTime),
prepTimeMinutes: isoToMinutes(prepTime),
totalTimeMinutes: isoToMinutes(totalTime),
rawIngredients: recipeIngredient,
steps: detailedInstructionsToSimple(recipeInstructionsDetailed),
detailedSteps: recipeInstructionsDetailed,
servings: extractNumber(recipeYield),
note,
};
}
Loading

0 comments on commit c1fe94d

Please sign in to comment.