In a recent project I needed to extract data using a lambda function.

The data we needed for the project is mined from various sets of web pages. The project involves capital markets data.

To meet our clients' mandate we needed to make the process extremely fast, reliable and modularized.

We have had a very good experience using AWS Lambda with Puppeteer in headless mode to extract the data from the services we mined.

Below is some sample code to explain the process. The data is parsed in the lambda function and written to an AWS bucket as JSON files.

The parameters needed to authorize the Lambda function to write to S3 are stored as environment variables of the Lambda function. This can be done manually or using the Serverless Framework.

The credentials can also be encrypted as a best practice. The Lambda function is authorized using the environment variables, as shown in the code, and thereafter it can write to any S3 bucket that the authentication keys grant access to.

The performance of this approach has been very good and the entire experience is pretty neat.

'use strict';

const _ = require('underscore');
var async = require("async");


// https://faragta.com/aws-lambda/write-file-to-s3.html
var AWS = require('aws-sdk');

// https://stackoverflow.com/questions/37096719/cant-set-aws-credentials-in-nodejs
// https://stackoverflow.com/a/46241489/644081
// Global AWS SDK settings: the access key pair and region are read from the
// Lambda function's environment variables.
const awsSettings = {
    accessKeyId: process.env.AWSAccessKeyId,
    secretAccessKey: process.env.AWSSecretAccessKey,
    region: process.env.AWSRegion,
};
AWS.config.update(awsSettings);

// Shared S3 client, created after the global configuration has been applied.
const s3 = new AWS.S3();

/**
 * Navigates `page` to `url` and runs the extraction script inside the page.
 *
 * A failed or timed-out navigation is logged and tolerated, so extraction
 * still runs against whatever the page managed to load. Errors raised while
 * evaluating the extraction script are logged and rethrown unchanged; closing
 * the browser is left to the caller, which owns it.
 *
 * @param {object} page - Puppeteer page; only `goto` and `evaluate` are used.
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<{data: Array}>} Object whose `data` holds the extracted entries.
 * @throws {Error} Whatever error `page.evaluate` (or a synchronous `page.goto`) raised.
 */
async function getData(page, url) {
    console.log(`Get data for ${url}`);
    try {
        await page.goto(url, { waitUntil: 'networkidle2', timeout: 500 }).catch((navigationError) => {
            // Best effort: keep going with a partially loaded page, but record why.
            console.log("Error caught", navigationError);
        });

        return await page.evaluate(() => {
            const entries = [];

            // SAMPLE PLACEHOLDER
            // FILL UP THE ENTRIES

            return { data: entries };
        });
    } catch (error) {
        console.log(error);
        // Neither a browser handle nor a callback exists in this scope, so
        // propagate the original error and let the caller clean up.
        throw error;
    }
}

exports.handler = async (event, context) => {
    console.log(`Going to start the handler`);

    const UserAgent = require('user-agents');
    const chromium = require('chrome-aws-lambda');

    let results = null;
    let browser = null;
    let items = new Array();

    try {
        const userAgent = new UserAgent({ deviceCategory: 'desktop' }).toString();
        chromium.args.push(`--user-agent=${userAgent}`);

        // console.dir(chromium.args, null, true);

        browser = await chromium.puppeteer.launch({
            args: chromium.args,
            defaultViewport: chromium.defaultViewport,
            executablePath: await chromium.executablePath,
            headless: chromium.headless,
        });

        const page = await browser.newPage();
        await page.setViewport({ width: 1900, height: 1024 });
        await intercept(page);

        results = await getData(page, event.url);
        items = items.concat(results.data);
    } catch (error) {
        console.log(error);
        return context.fail(error);
    } finally {
        if (browser !== null) {
            await browser.close();
        }
    }

    console.log(`Finished extracting ${items.length} items`);
    
    // https://faragta.com/aws-lambda/write-file-to-s3.html
    var bucketName = process.env.AWSBucket;
    var keyName = 'PROJECT_NAME/data-file.json';

    var params = { Bucket: bucketName, Key: keyName, Body: JSON.stringify(items) };

    s3.putObject(params, function (err, data) {
        if (err)
            console.log(err)
        else
            console.log("Successfully saved object to " + bucketName + "/" + keyName);
    });

    return items;
};