// You will need to install and run Node.js prior to setting up
const puppeteer = require('puppeteer');
/**
 * Read one property from the first node matched by an XPath expression.
 *
 * @param {object} page - Puppeteer page to query.
 * @param {string} xpath - XPath expression handed to `page.$x`.
 * @param {string} propertyName - DOM property to read, e.g. 'src'.
 * @returns {Promise<*>} The property's JSON value, or null when the XPath
 *   matches nothing.
 */
async function readXPathProperty(page, xpath, propertyName) {
  // NOTE(review): `page.$x` appears to be deprecated in recent Puppeteer
  // releases — confirm against the installed version.
  const [element] = await page.$x(xpath);
  if (!element) {
    // Absolute XPaths break whenever the page layout shifts; report a
    // missing field instead of crashing on `undefined.getProperty`.
    return null;
  }
  const handle = await element.getProperty(propertyName);
  return handle.jsonValue();
}

/**
 * Open `url` in a Puppeteer browser, read the product image URL, title and
 * price from fixed XPath locations, log them, and return them.
 *
 * @param {string} url - Address of the product page to load.
 * @returns {Promise<{imgURL: *, title: *, price: *}>} The scraped fields;
 *   a field is null when its XPath matched no node.
 */
async function scrapeProduct(url) {
  const xpaths = {
    image: '/html/body/div[1]/div[2]/div[9]/div[4]/div[3]/div[1]/div[1]/div/div/div[2]/div[1]/div[1]/ul/li[1]/span/span/div/img',
    title: '/html/body/div[1]/div[2]/div[9]/div[4]/div[4]/div[1]/div/h1/span',
    price: '/html/body/div[1]/div[2]/div[9]/div[4]/div[4]/div[10]/div[1]/div/table/tbody/tr[2]/td[2]/span[1]',
  };

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    const imgURL = await readXPathProperty(page, xpaths.image, 'src');
    const title = await readXPathProperty(page, xpaths.title, 'textContent');
    const price = await readXPathProperty(page, xpaths.price, 'textContent');

    const product = { imgURL, title, price };
    console.log(product);
    return product;
  } finally {
    // Always release the browser, even when navigation or a lookup throws.
    await browser.close();
  }
}
// Run the scraper against a sample product page. The rejection handler keeps
// a failed scrape from surfacing as an unhandled promise rejection.
scrapeProduct('https://www.amazon.com/Business-Microphone-Upgraded-NexiGo-Computer/dp/B08BHX7GYY/?_encoding=UTF8&smid=A1HNC035CZ2MR5&pd_rd_w=GsaOJ&pf_rd_p=45f0d3b0-8ddc-4840-9ac2-c26f2608345f&pf_rd_r=A1TQ15FXBKJH1JWYXXAD&pd_rd_r=82f7f31d-db1c-4831-96a3-bb110b1133f9&pd_rd_wg=urW4C&ref_=pd_gw_unk')
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
// This example uses axios, cheerio & Express
const PORT = 8000;
const axios = require('axios');
const cheerio = require('cheerio');
const express = require('express');
const app = express();
const url = 'https://www.theguardian.com/uk';

// Fetch the page, then collect the text and link of every element matching
// the `.fc-sublink__title` selector and print the resulting list.
axios(url)
  .then(({ data: html }) => {
    const $ = cheerio.load(html);
    const articles = [];

    $('.fc-sublink__title', html).each((_index, node) => {
      const heading = $(node);
      articles.push({
        title: heading.text(),
        url: heading.find('a').attr('href'),
      });
    });

    console.log(articles);
  })
  .catch((err) => console.log(err));

// Start the Express app on PORT; no routes are registered in this snippet.
app.listen(PORT, () => console.log(`server running on PORT ${PORT}`));
// For JavaScript running in the NodeJS environment:
const puppeteer = require('puppeteer');
// Template: open a visible (non-headless) browser, load the page, and run a
// caller-supplied function inside it. The browser is closed when the work
// finishes or fails, and any failure is reported instead of being left as an
// unhandled promise rejection.
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.goto("https://bbc.com");
    // `data` receives whatever the function below returns from the page.
    const data = await page.evaluate(() => {
      // Enter JavaScript to run on the page here!
    });
  } finally {
    await browser.close();
  }
})().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
const http = require('http');
// Fetch http://example.com with the core http module and print the body.
const req = http.request('http://example.com', (res) => {
  const chunks = [];
  res.on('data', (chunk) => chunks.push(chunk));
  // Concatenate the raw Buffers before decoding: Array#join would insert a
  // comma between chunks and decode each chunk separately, which can split
  // a multi-byte character across a chunk boundary.
  res.on('end', () => console.log(Buffer.concat(chunks).toString()));
});
// Without an 'error' listener a failed request throws as an uncaught event.
req.on('error', (err) => console.error(err));
req.end();
// For any beginner looking for how to scrape website content, this link explains it well:
//webscrapingapi.com/the-ultimate-guide-to-web-scraping-with-javascript-and-node-js/
// 73 & have a good day.
const http = require('http');
const PORT = 3000;

// Minimal HTTP server: every request gets a 200 plain-text "Hello World".
const server = http.createServer((req, res) => {
  res.statusCode = 200;
  res.setHeader('Content-Type', 'text/plain');
  res.end('Hello World');
});

// The constant is PORT; the lowercase `port` used before was never defined
// and made this call throw a ReferenceError.
server.listen(PORT, () => {
  console.log(`Server running at PORT:${PORT}/`);
});
// web scraping