Ot Aiops
支持多协议工业设备数据采集与智能诊断,具备高风险写入防护机制。
基于Puppeteer的网页数据抓取与浏览器自动化工具,支持动态内容提取和页面截图。
openclaw skills install @fasjdas/browser-automation-puppeteer
命令、参数、文件名以原文为准
基于 Puppeteer 的网页抓取与浏览器自动化。
✅ 建议使用此技能的情况:
❌ 不建议使用此技能的情况:
web_fetch 更合适
# 安装 Puppeteer
npm install puppeteer
# 基础抓取示例
node scripts/scrape.js https://example.com
const puppeteer = require('puppeteer');
/**
 * Launches a headless browser, loads `url`, and closes the browser again.
 *
 * The browser is closed in a `finally` block so that a failure while opening
 * the page or navigating does not leave the browser process running.
 *
 * @param {string} url - Address of the page to load.
 * @returns {Promise<void>} Resolves once the browser has been closed.
 * @throws Re-throws any error raised while opening the page or navigating.
 */
async function scrape(url) {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const page = await browser.newPage();
    // Wait for the network to go (almost) idle before touching the page.
    await page.goto(url, { waitUntil: 'networkidle2' });
    // ... extract data ...
  } finally {
    await browser.close();
  }
}

// Get the text of every element matching a selector
const titles = await page.$$eval('h2', (headings) =>
  headings.map(({ textContent }) => textContent.trim()),
);
// Get the text of a single element
const price = await page.$eval('.price', ({ textContent }) => textContent.trim());

// Get the inner HTML of a container element
const html = await page.$eval('.product-list', ({ innerHTML }) => innerHTML);

// Collect the trimmed text and the href attribute of every link
const links = await page.$$eval('a', (anchors) =>
  anchors.map((anchor) => {
    const text = anchor.textContent.trim();
    const href = anchor.getAttribute('href');
    return { text, href };
  }),
);

// Wait for the given selector to appear
await page.waitForSelector('.results', { timeout: 10000 });
// Wait for the network to go idle (recommended)
await page.goto(url, { waitUntil: 'networkidle2' });
// Wait until a custom function returns true
await page.waitForFunction(() => document.querySelectorAll('.item').length > 10);

/**
 * Collects the trimmed text of every `.item` element across numbered pages.
 *
 * Pages are requested as `${baseUrl}?page=1`, `${baseUrl}?page=2`, ... and the
 * loop stops at the first page that yields no items or after `maxPages` pages.
 * The browser is closed in a `finally` block so a failed navigation or
 * extraction does not leave the browser process running.
 *
 * @param {string} baseUrl - URL the `?page=N` query string is appended to.
 * @param {number} [maxPages=5] - Upper bound on the number of pages visited.
 * @returns {Promise<string[]>} Item texts in page order.
 * @throws Re-throws any error raised while navigating or extracting.
 */
async function scrapeWithPagination(baseUrl, maxPages = 5) {
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const page = await browser.newPage();
    const results = [];
    for (let i = 1; i <= maxPages; i++) {
      const url = `${baseUrl}?page=${i}`;
      await page.goto(url, { waitUntil: 'networkidle2' });
      const items = await page.$$eval('.item', (els) =>
        els.map((el) => el.textContent.trim()),
      );
      // Stop at the first page that yields no items.
      if (items.length === 0) break;
      results.push(...items);
    }
    return results;
  } finally {
    await browser.close();
  }
}

// Full-page screenshot
await page.screenshot({ path: 'screenshot.png', fullPage: true });
// Screenshot of a single element
const element = await page.$('.chart');
await element.screenshot({ path: 'chart.png' });

// Abort image, stylesheet and font requests; let every other request through
await page.setRequestInterception(true);
page.on('request', (request) => {
  const isBlocked = ['image', 'stylesheet', 'font'].includes(request.resourceType());
  if (!isBlocked) {
    request.continue();
    return;
  }
  request.abort();
});

// Usage: node scripts/scrape.js <url> [selector]
const puppeteer = require('puppeteer');

// Command-line arguments: target URL (required) and CSS selector
// (falls back to 'body' when omitted or empty).
const url = process.argv[2];
const selector = process.argv[3] || 'body';

if (!url) {
  console.error('使用方式:node scrape.js <url> [selector]');
  process.exit(1);
}

// Load the page and print the trimmed text of every element matching
// `selector` as a JSON array on stdout.
(async () => {
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });
    const content = await page.$$eval(selector, (els) =>
      els.map((el) => el.textContent.trim()),
    );
    console.log(JSON.stringify(content, null, 2));
  } finally {
    // Close the browser even when navigation or extraction throws, so a
    // failed run does not leave a browser process behind.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure explicitly instead of leaving the promise floating.
  console.error(err);
  process.exitCode = 1;
});

// Usage: node scripts/screenshot.js <url> [output.png]
const puppeteer = require('puppeteer');

// Command-line arguments: target URL (required) and output file
// (falls back to 'screenshot.png' when omitted or empty).
const url = process.argv[2];
const output = process.argv[3] || 'screenshot.png';

if (!url) {
  console.error('使用方式:node screenshot.js <url> [output.png]');
  process.exit(1);
}

// Load the page, save a full-page screenshot to `output`, and report the path.
(async () => {
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });
    await page.screenshot({ path: output, fullPage: true });
    console.log(`截图已保存至 ${output}`);
  } finally {
    // Close the browser even when navigation or the screenshot fails, so a
    // failed run does not leave a browser process behind.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure explicitly instead of leaving the promise floating.
  console.error(err);
  process.exitCode = 1;
});

// Usage: node crawl.js <url> <selector> [maxPages]
const puppeteer = require('puppeteer');
const url = process.argv[2];
const selector = process.argv[3];
const maxPages = parseInt(process.argv[4]) || 10;
if (!url || !selector) {
console.error('使用方式:node crawl.js <url> <selector> [maxPages]');
process.exit(1);
}
(async () => {
const browser = await puppeteer.launch({ headless: 'new' });
const page = await browser.newPage();
let allData = [];
for (let i = 1; i <= maxPages; i++) {
const pageUrl = url.includes('?') ? `${url}&page=${i}` : `${url}?page=${i}`;
console.error(`正在爬取:${pageUrl}`);
await page.goto(pageUrl, { waitUntil: 'networkidle2' });
const data = await page.$$eval(selector, els =>
els.map(el => el.textContent.trim())
);
if (data.length === 0) break;
allData.push(...data);
}
console.log(JSON.stringify(allData, null, 2));
await browser.close();
})();| 目标 | 选择器 |
|---|---|
| 所有链接 | a |
| 所有图片 | img |
| 标题 | h1, h2, h3 |
| 列表项 | ul li, ol li |
| 表格行 | table tr |
| 卡片/项目 | .item, .card, .product |
| 价格信息 | .price, [class*="price"] |
| 描述内容 | .description, .summary |

- `curl example.com/robots.txt`
- `await new Promise(r => setTimeout(r, 2000))`
- **networkidle2**
- `await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');`

有关详细的 Puppeteer API 信息,请参阅 [puppeteer/docs/api.md](references/puppeteer-api.md)。
已收录 1 个 Skill