NodeJS
A Retailer creates, receives, and extracts Tasks, and you can use any programming language to implement the Retailer RESTful API.
Before you continue, please complete the following work first:
- 1.
- 2.
- 3.
This tutorial shows you how to create a Retailer to implement Crawl Example Blog step by step. Normally, you can use bitskyai/bitsky-hello-retailer to get started.
Create a directory to hold your Retailer, and make that as your working directory.
$ mkdir crawlExampleBlogs
$ cd crawlExampleBlogs
Use the
npm init
command to create a package.json
file for your application. For more information on how package.json
works, see Specifics of npm’s package.json handling.
$ npm init
This command prompts you for a number of things, such as the name and version of your application. For now, you can simply hit RETURN to accept the defaults for most of them.
Now install @bitskyai/retailer-sdk in the
crawlExampleBlogs
directory and save it in the dependencies. For example:$ npm install @bitskyai/retailer-sdk --save
Inside
crawlExampleBlogs
, create index.js
, server.js and worker.js. const { startServer } = require("./server");
startServer();
// Add @bitskyai/retailer-sdk - https://www.npmjs.com/package/@bitskyai/retailer-sdk
const baseRetailerService = require("@bitskyai/retailer-sdk");
const path = require("path");
//------------------------------------------------------------------------------------------
const { settings, trigger, parse } = require("./worker");
module.exports = {
// DON'T change and remove startServer
startServer: async function startServer(customConfig) {
// Based on baseRetailerService APIs(https://www.npmjs.com/package/@bitskyai/retailer-sdk) to change
try {
baseRetailerService.express();
baseRetailerService.routers();
await baseRetailerService.listen();
baseRetailerService.logger.info(`start server successful`, {
configs: baseRetailerService.getConfigs()
});
return baseRetailerService;
} catch (err) {
baseRetailerService.logger.error(`startServer fail - ${err.message}`, {
error: err
});
throw err;
}
},
// DON'T change and remove stopServer
stopServer: async function stopServer() {
//--------------------------------------------
// Normally you don't need to change the code inside try/catch, but you still can change it if you need
// Based on baseRetailerService APIs(https://www.npmjs.com/package/@bitskyai/retailer-sdk) to change
try {
await baseRetailerService.stop();
} catch (err) {
throw err;
}
},
};
// https://apis.bitsky.ai/bitsky-retailer-sdk/BaseRetailerService.html
const baseRetailerService = require("@bitskyai/retailer-sdk");
//--------------------------------------------------------------
// Following are frequently use packages you possible need
// All the list packages are already pre-installed
//
// Available Packages:
// 1. https://docs.bitsky.ai/user-manual/retailer-editor/node-modules-full-list
// 2. https://nodejs.org/dist/latest-v12.x/docs/api/
//--------------------------------------------------------------
const path = require("path");
// `cheerio`: Fast, flexible & lean implementation of core jQuery designed specifically for the server
// https://www.npmjs.com/package/cheerio
const cheerio = require("cheerio");
// `lodash`: A modern JavaScript utility library delivering modularity, performance & extras
// https://lodash.com/
const _ = require("lodash");
// Retailer runtime settings consumed by server.js.
// You MUST change GLOBAL_ID to your correct Retailer Configuration Global ID:
// https://docs.bitsky.ai/how-tos/how-to-get-global-id#get-a-retailer-configuration-global-id
const settings = {
// The GLOBAL_ID env var, when set, takes precedence over the hard-coded value.
GLOBAL_ID: process.env.GLOBAL_ID || "6e56474d-0c75-4125-b5a8-27b0ccf71390",
// CONNECTOR_TYPE: "mongodb", // Save data to mongoDB
// MONGODB_URL: "mongodb://localhost:27017/helloretailer", // MongoDB URL
};
// Demonstrates how to execute JavaScript inside the crawled page: the page
// will wait 5 seconds before continuing.
// For more information, please take a look at `metadata.scripts` in
// https://apis.bitsky.ai/bitsky-retailer-sdk/global.html#Task
async function customFunction() {
  // `$$page` is provided by the Headless Producer at execution time —
  // it is not defined in this module.
  const pauseMilliseconds = 5 * 1000;
  await $$page.waitFor(pauseMilliseconds);
}
//========================================================================
// You can read https://docs.bitsky.ai/tutorials/crawl-example-blog to get detail understand what is the requirement of this example
//========================================================================
/**
 * Trigger creates the initial task(s) for your data crawling job.
 * **Supplier** uses the Task information to decide when to assign it to a suitable **Producer** to execute.
 * After a **Producer** successfully executes a Task, the Task is sent back to the **parse** function.
 * Trigger is the **entry point**, similar to the `main` function in Java or C/C++.
 * For more information, please take a look at https://apis.bitsky.ai/bitsky-retailer-sdk/BaseRetailerService.html#trigger
 *
 * @returns {object} - A JSON object with a `tasks` property. Normally you can use `baseRetailerService.generateTask` to generate a Task.
 * Detail information: https://apis.bitsky.ai/bitsky-retailer-sdk/global.html#TriggerFunReturn
 */
const trigger = async function trigger({ req, res }) {
  // The documented TriggerFunReturn contract expects a `tasks` property, so
  // return an explicit empty task list rather than a bare object.
  return {
    tasks: [],
  };
};
/**
 * After a **Producer** successfully executes a Task, the parse function is called
 * with the **Task** containing the crawled data.
 * Parse extracts data and decides whether to continue by adding more tasks.
 *
 * For example, in **trigger** we create a task to crawl http://exampleblog.bitsky.ai/; after the **Producer** crawls it
 * successfully, it sends back a Task that contains the HTML of http://exampleblog.bitsky.ai/.
 * Inside **parse** we parse the returned HTML, get the URL of each blog, and create tasks to continue crawling each blog.
 *
 * @returns {object} - https://apis.bitsky.ai/bitsky-retailer-sdk/global.html#ParseFunReturn
 */
const parse = async function parse({ req, res }) {
  try {
    // Return data that needs to be stored and tasks that need to be executed.
    // Check https://apis.bitsky.ai/bitsky-retailer-sdk/global.html#ParseFunReturn for more detail
    return {
      data: [],
      tasks: [],
    };
  } catch (err) {
    console.error(`parse error: ${err.message}`);
    // Still satisfy the ParseFunReturn contract instead of returning undefined.
    return {
      data: [],
      tasks: [],
    };
  }
};
// Export the worker pieces consumed by server.js: the Retailer settings,
// the trigger (crawl entry point), and the parse (data extraction) functions.
module.exports = {
settings,
trigger,
parse,
};
This is what the folder structure should look like:
crawlExampleBlogs/
node_modules/ # All the node_modules
index.js # Main enter file
package-lock.json # package-lock.json is automatically generated for any operations where npm modifies either the node_modules tree, or package.json.
package.json # Holds various metadata relevant to the project, like node_modules
server.js # Start or Stop Retailer server.
worker.js #
Run the Retailer with the following command:
$ node index.js
Producer server listening on http://localhost:8081/ in development mode
start server successful {
configs: {
PORT: 8081,
BITSKY_BASE_URL: 'http://localhost:9099',
GLOBAL_ID: undefined,
SERVICE_NAME: '@bitskyai/retailer-sdk',
NODE_ENV: undefined,
LOG_FILES_PATH: '/Users/neo/Downloads/crawlExampleBlogs/node_modules/@bitskyai/retailer-sdk/lib/public/log',
ERROR_LOG_FILE_NAME: 'error.log',
COMBINED_LOG_FILE_NAME: 'combined.log',
LOG_LEVEL: 'info',
DATA_PATH: '/Users/neo/Downloads/crawlExampleBlogs/node_modules/@bitskyai/retailer-sdk/lib/public/data.json',
CONNECTOR_TYPE: 'json',
MONGODB_URL: 'mongodb://localhost:27017/@bitskyai/retailer-sdk',
MONGODB_HOST: undefined,
MONGODB_NAME: undefined,
MONGODB_PORT: undefined,
MONGODB_USERNAME: undefined,
MONGODB_PASSWORD: undefined
}
}

Congratulations, you successfully created your first Retailer service. Now let us implement extract data logic and configure this Retailer.
@bitskyai/retailer-sdk implement Retailer RESTFul API, and you just need to implement the extract data logic.
// https://apis.bitsky.ai/bitsky-retailer-sdk/BaseRetailerService.html
const baseRetailerService = require("@bitskyai/retailer-sdk");
//--------------------------------------------------------------
// Following are frequently use packages you possible need
// All the list packages are already pre-installed
//
// Available Packages:
// 1. https://docs.bitsky.ai/user-manual/retailer-editor/node-modules-full-list
// 2. https://nodejs.org/dist/latest-v12.x/docs/api/
//--------------------------------------------------------------
const path = require("path");
// `cheerio`: Fast, flexible & lean implementation of core jQuery designed specifically for the server
// https://www.npmjs.com/package/cheerio
const cheerio = require("cheerio");
// `lodash`: A modern JavaScript utility library delivering modularity, performance & extras
// https://lodash.com/
const _ = require("lodash");
// Check full configuration - https://apis.bitsky.ai/bitsky-retailer-sdk/global.html#Configurations
const settings = {
// Human-readable name for this Retailer service.
SERVICE_NAME: "hello-retailer-service",
// URL of the BitSky core service this Retailer registers with.
BITSKY_BASE_URL: "http://localhost:9099",
// You MUST change to correct Retailer Configuration Global ID
// https://docs.bitsky.ai/how-tos/how-to-get-global-id#get-a-retailer-configuration-global-id
// The GLOBAL_ID env var, when set, takes precedence over the hard-coded value.
GLOBAL_ID: process.env.GLOBAL_ID || "6e56474d-0c75-4125-b5a8-27b0ccf71390",
// CONNECTOR_TYPE: "mongodb", // Save data to mongoDB
// MONGODB_URL: "mongodb://localhost:27017/helloretailer", // MongoDB URL
};
// Shows how to run custom JavaScript inside the crawled page: pause 5 seconds.
// For more information, see `metadata.scripts` in
// https://apis.bitsky.ai/bitsky-retailer-sdk/global.html#Task
async function customFunction() {
  // `$$page` is injected by the Headless Producer when the script runs.
  const waitMs = 5 * 1000;
  await $$page.waitFor(waitMs);
}
//========================================================================
// You can read https://docs.bitsky.ai/tutorials/crawl-example-blog to get detail understand what is the requirement of this example
//========================================================================
/**
 * Trigger creates the initial task(s) for your data crawling job.
 * **Supplier** uses the Task information to decide when to assign it to a suitable **Producer** to execute.
 * After a **Producer** successfully executes a Task, the Task is sent back to the **parse** function.
 * Trigger is the **entry point**, similar to the `main` function in Java or C/C++.
 * For more information, please take a look at https://apis.bitsky.ai/bitsky-retailer-sdk/BaseRetailerService.html#trigger
 *
 * @returns {object} - A JSON object with a `tasks` property. Normally you can use `baseRetailerService.generateTask` to generate a Task.
 * Detail information: https://apis.bitsky.ai/bitsky-retailer-sdk/global.html#TriggerFunReturn
 */
const trigger = async function trigger({ req, res }) {
  // Seed the crawl with a single Task pointing at the blog list page.
  // API: https://apis.bitsky.ai/bitsky-retailer-sdk/BaseRetailerService.html#generateTask
  //
  // metadata notes:
  //  - `type` distinguishes pages: "bloglist" pages yield blog links and new
  //    tasks; "blog" pages are saved as JSON.
  //  - `script` is reserved for a JavaScript code string to execute in the
  //    page (only a Headless Producer can run tasks that carry a script).
  //    Here it makes the page wait 5 seconds — not necessary, purely a demo;
  //    convert the function with `customFunction.toString()`.
  const seedTask = baseRetailerService.generateTask({
    // Target website URL
    url: "http://exampleblog.bitsky.ai/",
    // `1` is the highest priority; useful when tasks must run in order.
    priority: 1,
    metadata: { type: "bloglist", script: customFunction.toString() },
  });
  return {
    tasks: [seedTask],
  };
};
/**
 * After a **Producer** successfully executes a Task, the parse function is called
 * with the **Task** containing the crawled data.
 * Parse extracts data and decides whether to continue by adding more tasks.
 *
 * For example, in **trigger** we create a task to crawl http://exampleblog.bitsky.ai/; after the **Producer** crawls it
 * successfully, it sends back a Task that contains the HTML of http://exampleblog.bitsky.ai/.
 * Inside **parse** we parse the returned HTML, get the URL of each blog, and create tasks to continue crawling each blog.
 *
 * @returns {object} - https://apis.bitsky.ai/bitsky-retailer-sdk/global.html#ParseFunReturn
 */
const parse = async function parse({ req, res }) {
  try {
    // Tasks returned from the Producer. Task Schema - https://github.com/bitskyai/bitsky-supplier/blob/develop/src/schemas/task.json
    // By default, crawled HTML is stored in task.dataset.data.content
    const returnTasks = req.body;
    // New Tasks that need to be sent to the BitSky Supplier
    const tasks = [];
    // Crawled data; by default it is stored on local disk
    const storeData = [];
    // Base URL used to resolve relative links when creating new Tasks
    const targetBaseURL = "http://exampleblog.bitsky.ai/";
    for (const task of returnTasks) {
      // Crawled HTML - https://github.com/bitskyai/bitsky-supplier/blob/develop/src/schemas/task.json
      const htmlString = task.dataset.data.content;
      // cheerio: fast, flexible & lean implementation of core jQuery for the server.
      // You can find how to use cheerio at https://cheerio.js.org/
      // If you like, you can also use `xpath` - https://www.npmjs.com/package/xpath
      const $ = cheerio.load(htmlString);
      if (task.metadata.type === "bloglist") {
        // A **bloglist** page: collect every blog link on it.
        // More detail: https://docs.bitsky.ai/tutorials/crawl-example-blog#crawl-each-blog-list-page-and-get-blogs-link
        const blogUrls = $("div.post-preview a");
        // Use a distinct index name so it cannot be confused with an outer loop counter.
        for (let linkIndex = 0; linkIndex < blogUrls.length; linkIndex++) {
          const $blog = $(blogUrls[linkIndex]);
          // Blog page link; resolve relative hrefs against the base URL.
          const url = new URL($blog.attr("href"), targetBaseURL).toString();
          // You can use `logger.info` / `logger.error` for debugging —
          // see https://www.npmjs.com/package/winston for detail
          baseRetailerService.logger.info(`blog page link: ${url}`);
          // Add a Task to crawl the blog page.
          tasks.push(
            baseRetailerService.generateTask({
              url,
              // Priority `2`: crawl all blog list pages first, then all blogs.
              priority: 2,
              metadata: {
                // `type: "blog"` marks this task as crawling a single blog.
                type: "blog",
              },
            })
          );
        }
        // Link to the next blog list page, if any.
        let nextUrl = $("ul.pager li.next a").attr("href");
        if (nextUrl) {
          nextUrl = new URL(nextUrl, targetBaseURL).toString();
          baseRetailerService.logger.info(`blog list page link: ${nextUrl}`);
          // Create a Task to crawl the next blog list page.
          tasks.push(
            baseRetailerService.generateTask({
              url: nextUrl,
              // Blog list pages get the highest priority.
              priority: 1,
              metadata: {
                // Marks this task as crawling a blog list page.
                type: "bloglist",
                // Only to demonstrate executing JavaScript in the browser.
                script: customFunction.toString(),
              },
            })
          );
        }
      } else if (task.metadata.type === "blog") {
        // A **blog** page: extract its fields for storage.
        storeData.push({
          title: $("div.post-heading h1").text(),
          author: $("div.post-heading p.meta span.author").text(),
          date: $("div.post-heading p.meta span.date").text(),
          content: $("div.post-container div.post-content").text(),
          url: task.dataset.url,
        });
      } else {
        baseRetailerService.logger.error("unknown type");
      }
    }
    // Return data that needs to be stored and tasks that need to be executed.
    // Check https://apis.bitsky.ai/bitsky-retailer-sdk/global.html#ParseFunReturn for more detail
    return {
      data: storeData,
      tasks: tasks,
    };
  } catch (err) {
    // NOTE(review): logs and falls through, returning undefined — presumably
    // the SDK tolerates a missing ParseFunReturn; confirm before changing.
    baseRetailerService.logger.error(`parse error: ${err.message}`);
  }
};
// Export the worker pieces consumed by server.js: the Retailer settings,
// the trigger (crawl entry point), and the parse (data extraction) functions.
module.exports = {
settings,
trigger,
parse,
};
Let us take a look at each part.
This is the settings of your Retailer. e.g. Retailer Configuration Global ID, CONNECTOR_TYPE, and configuration for the connector.
For now, we have two connectors -
json
and mongodb
. By default, it uses json
, you can change to mongodb
and provide MONGODB_URL
trigger
is used to create the first task(s) for your data crawling job. To crawl https://exampleblog.bitsky.ai/, we add the first Task to crawl the blog list page; then, in the parse
function based on crawled data decide to add Tasks to crawl blog list page, crawl blog page or save crawled blog data.After Producers successfully execute Tasks,
parse
function will be called. And receive the Tasks contain crawled data, check Task Schema.By default,
task.dataset.data.content
is an HTML String, so we can use cheerio to parse the HTML String, then use a CSS Selector to extract data from it. For example:
let $ = cheerio.load(htmlString);
let blogUrls = $("div.post-preview a");
It gets the following 3 items

Then you can use
$(blogUrls[0]).attr("href")
to get the URL for each blog, use baseRetailerService.generateTask
to generate a Task to crawl the blog page.tasks.push(
baseRetailerService.generateTask({
url,
// Set `priority` to `2`, so we can first crawl all blog list page, then crawl all blogs
priority: 2,
metadata: {
// Add `type: "blog"` to indicate this task is for crawl blog
type: "blog",
},
})
);
And if there is a next blog list page, also generate a Task to crawl that blog list page.
tasks.push(
baseRetailerService.generateTask({
url: nextUrl,
// blog list page is highest priority
priority: 1,
metadata: {
// indicate this task is for crawl blog list page
type: "bloglist",
// Just to show you how to execute JavaScript in the browser
script: customFunction.toString(),
},
})
);
customFunction.toString()
is only used to show you how to execute custom JavaScript in the browser page; check more detail in How to execute JavaScript in a Task. If the Task's type is
blog
, then extract data and save to disk.storeData.push({
title: $("div.post-heading h1").text(),
author: $("div.post-heading p.meta span.author").text(),
date: $("div.post-heading p.meta span.date").text(),
content: $("div.post-container div.post-content").text(),
url: task.dataset.url,
});
At the end of this
parse
function, return data
that needs to store, and tasks
need to add to BitSky. Check more detail at https://apis.bitsky.ai/bitsky-retailer-sdk/global.html#ParseFunReturn
return {
data: storeData,
tasks: tasks,
}
// Add @bitskyai/retailer-sdk - https://www.npmjs.com/package/@bitskyai/retailer-sdk
const baseRetailerService = require("@bitskyai/retailer-sdk");
const path = require("path");
//------------------------------------------------------------------------------------------
const { settings, trigger, parse } = require("./worker");
module.exports = {
// DON'T change and remove startServer
startServer: async function startServer() {
// Based on baseRetailerService APIs(https://www.npmjs.com/package/@bitskyai/retailer-sdk) to change
try {
// environment variable will overwrite settings
baseRetailerService.setConfigs(settings);
baseRetailerService.trigger(trigger);
baseRetailerService.parse(parse);
baseRetailerService.express();
baseRetailerService.routers();
await baseRetailerService.listen();
baseRetailerService.logger.info(`start server successful`, {
configs: baseRetailerService.getConfigs()
});
return baseRetailerService;
} catch (err) {
baseRetailerService.logger.error(`startServer fail - ${err.message}`, {
error: err
});
throw err;
}
},
// DON'T change and remove stopServer
stopServer: async function stopServer() {
//--------------------------------------------
// Normally you don't need to change the code inside try/catch, but you still can change it if you need
// Based on baseRetailerService APIs(https://www.npmjs.com/package/@bitskyai/retailer-sdk) to change
try {
await baseRetailerService.stop();
} catch (err) {
throw err;
}
},
};
Now you only need to restart your Retailer. Stop previous
node index.js
, and re-run it again.Last modified 2yr ago