page.setRequestInterception(true)拦截器的使用方法和场景
现附上Puppeteer的Api的链接
https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md
<https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md>
实用场景(没错,就是实用):比如我用 Puppeteer 打开某个网页,只想抓取其中某一条 url 的 response 内容;或者我需要截图、生成 PDF,但不需要其中的图片,这时就可以把后缀是图片的 url 过滤掉。
使用的api:定位到api的链接
https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#class-request
<https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#class-request>
主要是class: Request 和 class: Response 两大块相结合
官方例子参考1:
// Official example: with interception switched on, every outgoing request is
// answered locally with a plain-text 404 instead of reaching the network.
await page.setRequestInterception(true);

page.on('request', function replyNotFound(intercepted) {
  const cannedReply = {
    status: 404,
    contentType: 'text/plain',
    body: 'Not Found!',
  };
  intercepted.respond(cannedReply);
});
实际使用的例子参考1:
'use strict';

const puppeteer = require('puppeteer');

// Example 1: open a page, log every request that fails, and inspect the
// Response object returned for the main document.
(async () => {
  const browser = await puppeteer.launch({
    ignoreHTTPSErrors: true,
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();

    // Attach the listener BEFORE navigating; otherwise failures that happen
    // while the page is loading are never reported.
    // Request interception is deliberately not enabled here: 'requestfailed'
    // does not need it, and enabling it without a 'request' handler that
    // continues the requests would leave them stalled.
    page.on('requestfailed', (request) => {
      console.log(request.url() + ' ' + request.failure().errorText);
    });

    const response = await page.goto('http://www.google.com');
    console.log(response);

    // response.ok() is true when the status code is in the 200-299 range.
    console.log(response.ok());
    // response.status() is the numeric HTTP status code.
    console.log(response.status());
    // response.headers() is an object holding the HTTP response headers.
    console.log(response.headers());
    // response.text() resolves with the response body as a string.
    // response.json() parses the body as JSON and rejects when the body is
    // not JSON (as for this HTML page), so it is not called here.
    const body = await response.text();
    console.log(body.length);
  } finally {
    // Always release the browser, even when navigation throws.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
实际参考例子2:
'use strict';

const puppeteer = require('puppeteer');

// Example 2: abort every image request so the page loads without pictures,
// then save a full-page screenshot.
(async () => {
  let browser;
  try {
    browser = await puppeteer.launch({
      ignoreHTTPSErrors: true,
      headless: false,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    const page = await browser.newPage();
    await page.setRequestInterception(true);

    let num = 0;
    // page.on() only registers the handler, so there is nothing to await.
    page.on('request', (request) => {
      // resourceType() tells what kind of resource is being requested:
      // abort image requests and let everything else through.
      if (request.resourceType() === 'image') {
        console.log(num + "image: ");
        // url is a method. request.response() is not logged because no
        // response exists yet when the request is intercepted.
        console.log(request.url());
        num++;
        request.abort();
      } else {
        // Alternative: request.respond({ status: 200, contentType:
        // 'text/plain', body: 'GOOD!' }) fulfils the request with a mocked
        // response instead of letting it reach the network.
        console.log("continue");
        request.continue();
      }
    });

    await page.goto('https://news.google.com/news/');
    await page.screenshot({ path: 'news.png', fullPage: true });
  } catch (e) {
    console.log(e);
  } finally {
    // Close the browser on both the success and the failure path; it is
    // still undefined when launch() itself failed.
    if (browser) {
      await browser.close();
    }
  }
})();
代码里的注释已经很清楚,也很容易理解。以上是两个比较常用的例子,接下来是实战中更常用、更实用的例子。
实际参考例子3(重点):
/**
 * Waits for the response to one specific URL and resolves with its body text.
 *
 * Every request seen on the page is logged and continued. Note that
 * request.continue() is only valid while request interception is enabled, so
 * the caller is assumed to have called page.setRequestInterception(true).
 * The 'request' handler stays registered after the promise settles so that
 * later requests are still continued; the 'response' handler is removed as
 * soon as the wanted response has been seen.
 *
 * @param page - Puppeteer page (any object emitting 'request' and 'response'
 *   events through on/off).
 * @param {string} [targetUrl='https://test.do'] - Exact URL whose response
 *   body is wanted.
 * @returns {Promise<string>} Body text of the first response whose URL equals
 *   targetUrl; rejects when reading that body fails.
 */
async function getResponseMsg(page, targetUrl = 'https://test.do') {
  return new Promise((resolve, reject) => {
    const onResponse = (response) => {
      if (response.url() !== targetUrl) {
        return;
      }
      // Only the first matching response is needed, so stop listening.
      page.off('response', onResponse);
      const req = response.request();
      console.log("Response 的:" + req.method(), response.status(), req.url());
      // Forward a failed body read to the caller instead of hanging forever.
      response.text().then(resolve, reject);
    };
    // Registered once up front rather than inside the 'request' handler, so
    // repeated requests to the target cannot stack up duplicate listeners.
    page.on('response', onResponse);

    page.on('request', (request) => {
      console.log(request.url());
      if (request.url() === targetUrl) {
        console.log("拦截到了这条url然后就该请求了");
      } else {
        console.log("continue");
      }
      request.continue();
    });
  });
}
稍微解释一下上面这个例子:就是拦截指定的 url,拿到它的 response 内容,然后返回。代码很清晰,就不多赘述了。全是爬坑干货,欢迎一起爬坑。
热门工具 换一换