Skip to content

Commit 07b0ded

Browse files
committed
feat(web-scraper): add full support for Playwright user content extractors
1 parent 93acc59 commit 07b0ded

30 files changed

+1416
-1095
lines changed

.nvmrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
20
1+
22

components/web_scraper/package.json

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
},
88
"version": "0.0.1",
99
"engines": {
10-
"node": "20.x"
10+
"node": "22.x"
1111
},
1212
"type": "module",
1313
"main": "dist/src/index.js",
@@ -17,8 +17,8 @@
1717
"lint": "eslint .",
1818
"lint:fix": "eslint . --fix",
1919
"watch": "NODE_OPTIONS=--import=./register.js RETRACK_WEB_SCRAPER_ENV_PATH=../../.env nodemon src/index.ts",
20-
"test": "NODE_OPTIONS=--import=./register.js NODE_NO_WARNINGS=1 node --test ./src/tests.ts",
21-
"test:watch": "NODE_OPTIONS=--import=./register.js NODE_NO_WARNINGS=1 node --test --watch ./src/tests.ts"
20+
"test": "NODE_OPTIONS=--import=./register.js NODE_NO_WARNINGS=1 node --experimental-test-module-mocks --test ./src/tests.ts",
21+
"test:watch": "NODE_OPTIONS=--import=./register.js NODE_NO_WARNINGS=1 node --experimental-test-module-mocks --test --watch ./src/tests.ts"
2222
},
2323
"dependencies": {
2424
"@fastify/compress": "^7.0.3",
@@ -27,14 +27,15 @@
2727
"fastify": "^4.28.1",
2828
"js-beautify": "^1.15.1",
2929
"node-cache": "^5.1.2",
30-
"playwright": "1.45.1"
30+
"playwright-core": "1.45.1"
3131
},
3232
"devDependencies": {
3333
"@eslint/eslintrc": "^3.1.0",
34-
"@eslint/js": "^9.9.0",
34+
"@eslint/js": "^9.9.1",
3535
"@types/js-beautify": "^1.14.3",
36-
"@types/node": "^22.2.0",
37-
"eslint": "^9.9.0",
36+
"@types/node": "^22.5.1",
37+
"@types/ws": "^8.5.12",
38+
"eslint": "^9.9.1",
3839
"eslint-config-prettier": "^9.1.0",
3940
"eslint-plugin-prettier": "^5.2.1",
4041
"globals": "^15.9.0",
@@ -43,6 +44,7 @@
4344
"prettier": "^3.3.3",
4445
"ts-node": "^10.9.2",
4546
"typescript": "^5.5.4",
46-
"typescript-eslint": "^8.1.0"
47+
"typescript-eslint": "^8.3.0",
48+
"ws": "^8.18.0"
4749
}
4850
}

components/web_scraper/src/api/api_result.ts

Lines changed: 0 additions & 1 deletion
This file was deleted.

components/web_scraper/src/api/api_route_params.mocks.ts

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,29 +2,24 @@ import { mock } from 'node:test';
22

33
import { fastify } from 'fastify';
44
import NodeCache from 'node-cache';
5-
import type { Browser } from 'playwright';
65

76
import type { Config } from '../config.js';
87
import { configure } from '../config.js';
9-
import type { BrowserInfo } from '../index.js';
10-
import { createBrowserMock } from '../mocks.js';
8+
import type { BrowserEndpoint } from '../utilities/browser.js';
119

1210
interface MockOptions {
13-
browser?: Browser;
14-
browserInfo?: BrowserInfo;
11+
browserEndpoint?: BrowserEndpoint;
1512
config?: Config;
1613
}
1714

1815
export function createMock({
19-
browser = createBrowserMock() as unknown as Browser,
2016
config = configure(),
21-
browserInfo = { running: false, contexts: [] },
17+
browserEndpoint = { url: 'ws://localhost:3000', protocol: 'playwright' },
2218
}: MockOptions = {}) {
2319
return {
2420
server: fastify({ logger: { level: 'warn' } }),
2521
cache: new NodeCache({ stdTTL: 0 }),
2622
config,
27-
acquireBrowser: mock.fn(() => Promise.resolve(browser)),
28-
browserInfo: mock.fn(() => browserInfo),
23+
getBrowserEndpoint: mock.fn(() => Promise.resolve(browserEndpoint)),
2924
};
3025
}
components/web_scraper/src/api/api_route_params.ts

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
import type { FastifyInstance } from 'fastify';
22
import type NodeCache from 'node-cache';
3-
import type { Browser } from 'playwright';
43

54
import type { Config } from '../config.js';
6-
import type { BrowserInfo } from '../index.js';
5+
import type { BrowserEndpoint } from '../utilities/browser.js';
76

87
export interface ApiRouteParams {
98
server: FastifyInstance;
109
cache: NodeCache;
1110
config: Config;
12-
acquireBrowser: () => Promise<Browser>;
13-
browserInfo: () => BrowserInfo;
11+
getBrowserEndpoint: (options?: { launchServer?: boolean }) => Promise<BrowserEndpoint>;
1412
}

components/web_scraper/src/api/diagnostics.ts

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,4 @@
1-
import type { FastifyBaseLogger } from 'fastify';
2-
import type { Browser } from 'playwright';
3-
41
export class Diagnostics {
5-
public static async screenshot(log: FastifyBaseLogger, browser: Browser) {
6-
log.info('Capturing screenshots...');
7-
if (!browser.isConnected()) {
8-
log.error('Browser is not connected, bailing out...');
9-
return;
10-
}
11-
12-
try {
13-
const pages = browser.contexts().flatMap((context) => context.pages());
14-
log.debug(`Retrieved ${pages.length} pages.`);
15-
for (const page of pages) {
16-
if (page.isClosed()) {
17-
log.debug(`Page is closed: ${page.url()}.`);
18-
} else if (page.url() === 'about:blank') {
19-
log.debug(`Skipping page: ${page.url()}.`);
20-
} else {
21-
log.info(`Making screenshot ${page.url()}.`);
22-
log.error(
23-
{
24-
screenshot: Buffer.from((await page.screenshot({ fullPage: true })).toString('base64'), 'base64'),
25-
},
26-
`Screenshot is made ${page.url()}.`,
27-
);
28-
}
29-
}
30-
} catch (err) {
31-
log.error('Failed to capture screenshots', err);
32-
}
33-
}
34-
352
public static errorMessage(err: unknown): string {
363
if (typeof err === 'string') {
374
return err;

components/web_scraper/src/api/status/get.test.ts

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,14 @@ await test('[/api/status] can successfully create route', () => {
1010

1111
await test('[/api/status] returns version from the config', async () => {
1212
const configMock = { version: '1.0.0-rc.100', browserTTLSec: 1, cacheTTLSec: 2, port: 3 };
13-
const browserInfoMock = {
14-
running: true,
15-
name: 'chromium',
16-
version: '1.0.0',
17-
contexts: [{ pages: ['https://retrack.dev'] }],
18-
};
19-
const response = await registerStatusGetRoutes(
20-
createMock({ config: configMock, browserInfo: browserInfoMock }),
21-
).inject({
13+
const response = await registerStatusGetRoutes(createMock({ config: configMock })).inject({
2214
method: 'GET',
2315
url: '/api/status',
2416
});
2517

26-
assert.strictEqual(response.body, JSON.stringify({ version: configMock.version, browser: browserInfoMock }));
18+
assert.strictEqual(
19+
response.body,
20+
JSON.stringify({ version: configMock.version, browser: { protocol: 'playwright', url: 'ws://localhost:3000' } }),
21+
);
2722
assert.strictEqual(response.statusCode, 200);
2823
});

components/web_scraper/src/api/status/get.ts

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import type { ApiRouteParams } from '../api_route_params.js';
22

3-
export function registerStatusGetRoutes({ server, config, browserInfo }: ApiRouteParams) {
3+
export function registerStatusGetRoutes({ server, config, getBrowserEndpoint }: ApiRouteParams) {
44
// Register a route that returns the status of the Web Scraper component.
55
return server.get(
66
'/api/status',
@@ -14,24 +14,19 @@ export function registerStatusGetRoutes({ server, config, browserInfo }: ApiRout
1414
browser: {
1515
type: 'object',
1616
properties: {
17-
running: { type: 'boolean' },
18-
name: { type: 'string', nullable: true },
19-
version: { type: 'string', nullable: true },
20-
contexts: {
21-
type: 'array',
22-
items: { type: 'object', properties: { pages: { type: 'array', items: { type: 'string' } } } },
23-
},
17+
protocol: { type: 'string' },
18+
url: { type: 'string', nullable: true },
2419
},
2520
},
2621
},
2722
},
2823
},
2924
},
3025
},
31-
() => {
26+
async () => {
3227
return {
3328
version: config.version,
34-
browser: browserInfo(),
29+
browser: await getBrowserEndpoint({ launchServer: false }),
3530
};
3631
},
3732
);
Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,36 @@
11
/**
2-
* Default timeout for the page load, in ms.
2+
* Default timeout for the user scenario, in ms.
33
*/
4-
export const DEFAULT_TIMEOUT_MS = 10000;
4+
export const DEFAULT_USER_SCRIPT_TIMEOUT_MS = 30000;
5+
6+
// Every user scenario is represented as an ES module that is prefixed with this string.
7+
export const USER_MODULE_PREFIX = 'data:text/javascript,void("retrack");';
8+
9+
/**
10+
* Represents the type of message that can be sent from the worker to the main thread.
11+
*/
12+
export enum WorkerMessageType {
13+
LOG = 'log',
14+
RESULT = 'error',
15+
}
16+
17+
/**
18+
* Represents a log message that can be sent from the worker to the main thread.
19+
*/
20+
export interface WorkerLogMessage {
21+
type: WorkerMessageType.LOG;
22+
message: string;
23+
level?: string;
24+
screenshots?: Map<string, Uint8Array>;
25+
args?: ReadonlyArray<object>;
26+
}
27+
28+
/**
29+
* Represents a result message that can be sent from the worker to the main thread.
30+
*/
31+
export interface WorkerResultMessage {
32+
type: WorkerMessageType.RESULT;
33+
content: { type: WorkerStringResultType; value: string };
34+
}
35+
36+
export type WorkerStringResultType = 'html' | 'text' | 'json';

0 commit comments

Comments
 (0)