Skip to content

Commit 495eab5

Browse files
committed
feat(web-scraper): add support for saving screenshots to disk
1 parent 55588d1 commit 495eab5

File tree

6 files changed

+26
-5
lines changed

6 files changed

+26
-5
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dist/
88
.env
99

1010
components/retrack-types/Cargo.lock
11+
components/retrack-web-scraper/.screenshots
1112

1213
# Local Retrack configuration file
1314
retrack.toml

components/retrack-web-scraper/src/api/web_page/constants.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,6 @@ export interface WorkerData {
5151
userAgent?: string;
5252
// Whether to ignore HTTPS errors when sending network requests.
5353
ignoreHTTPSErrors?: boolean;
54+
// Path to a folder where screenshots should be saved.
55+
screenshotsPath?: string;
5456
}

components/retrack-web-scraper/src/api/web_page/execute.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ interface RequestBodyType {
4545
ignoreHTTPSErrors?: boolean;
4646
}
4747

48-
export function registerExecuteRoutes({ server, getBrowserEndpoint }: ApiRouteParams) {
48+
export function registerExecuteRoutes({ config, server, getBrowserEndpoint }: ApiRouteParams) {
4949
return server.post<{ Body: RequestBodyType }>(
5050
'/api/web_page/execute',
5151
{
@@ -75,6 +75,7 @@ export function registerExecuteRoutes({ server, getBrowserEndpoint }: ApiRoutePa
7575
previousContent: request.body.previousContent,
7676
userAgent: request.body.userAgent,
7777
ignoreHTTPSErrors: request.body.ignoreHTTPSErrors,
78+
screenshotsPath: config.browserScreenshotsPath,
7879
};
7980

8081
try {
@@ -103,7 +104,7 @@ export function registerExecuteRoutes({ server, getBrowserEndpoint }: ApiRoutePa
103104
if (message.level === 'error') {
104105
workerLog.error(message.message, message.args);
105106
for (const [url, screenshot] of message.screenshots ?? []) {
106-
workerLog.error({ screenshot }, `Screenshot for ${url}.`);
107+
workerLog.error({ screenshot: Buffer.from(screenshot).toString('base64') }, `Screenshot for ${url}.`);
107108
}
108109
} else {
109110
workerLog.info(message.message, message.args);

components/retrack-web-scraper/src/api/web_page/worker.ts

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ if (!parentPort) {
1414
}
1515

1616
// Load the extractor script as an ES module.
17-
const { endpoint, extractor, tags, previousContent, userAgent, ignoreHTTPSErrors } = workerData as WorkerData;
17+
const { endpoint, extractor, tags, previousContent, userAgent, ignoreHTTPSErrors, screenshotsPath } =
18+
workerData as WorkerData;
1819

1920
// SECURITY: Basic prototype pollution protection against the most common vectors until we can use Playwright with
2021
// `--frozen-intrinsics`. It DOES NOT protect against all prototype pollution vectors.
@@ -112,7 +113,18 @@ try {
112113
pages.map((page) => ({ url: page.url() })),
113114
new Map(
114115
await Promise.all(
115-
pages.map(async (page) => [page.url(), await page.screenshot({ fullPage: true })] as [string, Uint8Array]),
116+
pages.map(
117+
async (page) =>
118+
[
119+
page.url(),
120+
await page.screenshot({
121+
fullPage: true,
122+
path: screenshotsPath
123+
? `${screenshotsPath}/screenshot_${encodeURIComponent(page.url())}_${Date.now()}.png`
124+
: undefined,
125+
}),
126+
] as [string, Uint8Array],
127+
),
116128
),
117129
),
118130
);

components/retrack-web-scraper/src/config.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ export interface Config {
77
port: number;
88
cacheTTLSec: number;
99
browserTTLSec: number;
10+
browserScreenshotsPath?: string;
1011
userAgent?: string;
1112
}
1213

@@ -18,6 +19,7 @@ export function configure(): Config {
1819
port: +(process.env.RETRACK_WEB_SCRAPER_PORT ?? 0) || 7272,
1920
cacheTTLSec: +(process.env.RETRACK_WEB_SCRAPER_CACHE_TTL_SEC ?? 0) || 20 * 60,
2021
browserTTLSec: +(process.env.RETRACK_WEB_SCRAPER_BROWSER_TTL_SEC ?? 0) || 10 * 60,
22+
browserScreenshotsPath: process.env.RETRACK_WEB_SCRAPER_BROWSER_SCREENSHOTS_PATH,
2123
userAgent: process.env.RETRACK_WEB_SCRAPER_USER_AGENT,
2224
};
2325
}

components/retrack-web-scraper/src/index.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@ const server = fastify({
1919
? { level: process.env.RETRACK_WEB_SCRAPER_LOG_LEVEL ?? 'debug' }
2020
: {
2121
level: process.env.RETRACK_WEB_SCRAPER_LOG_LEVEL ?? 'debug',
22-
transport: { target: 'pino-pretty', options: { translateTime: 'HH:MM:ss Z', ignore: 'pid,hostname' } },
22+
transport: {
23+
target: 'pino-pretty',
24+
options: { translateTime: 'HH:MM:ss Z', ignore: 'pid,hostname,screenshot' },
25+
},
2326
},
2427
}).addHook('onClose', () => stopBrowserServer());
2528

0 commit comments

Comments
 (0)