You've successfully subscribed to MyPad Blog
Great! Next, complete checkout for full access to MyPad Blog
Welcome back! You've successfully signed in.
Success! Your account is fully activated, you now have access to all content.
Success! Your billing info is updated.
Billing info update failed.

Extract data from an AWS Lambda function and write it to an AWS S3 bucket

Extract data from an AWS Lambda function and write it to an AWS S3 bucket

In a recent project I needed to extract data using a lambda function.

The data we needed for the project is mined from various sets of web pages. The project involves capital markets data.

To meet our clients' mandate we needed to make the process extremely fast, reliable and modularized.

We have had a very good experience using AWS Lambda with Puppeteer in headless mode to extract the data from the services we mined.

Below is some sample code to explain the process. The data is parsed in the lambda function and written to an AWS bucket as JSON files.

The parameters related to authorizing the Lambda function to write to S3 are stored as environment variables of the Lambda function. This can be done manually or using the Serverless Framework.

The credentials can also be encrypted as a best practice. The Lambda function is authorized using the environment variables, as shown in the code, and thereafter it can write to any particular S3 bucket it is authorized for using the authentication keys.

The performance of the above idea has been very good and the entire experience is pretty neat.

'use strict';

const _ = require('underscore');
var async = require("async");

var AWS = require('aws-sdk');

// Authorize the SDK from Lambda environment variables (set manually or via
// the Serverless Framework). The extraction dropped the enclosing call;
// these key/value pairs belong inside AWS.config.update().
AWS.config.update({
    "accessKeyId": process.env.AWSAccessKeyId,
    "secretAccessKey": process.env.AWSSecretAccessKey,
    "region": process.env.AWSRegion
});

// Shared S3 client used by the handler to persist scraped results.
var s3 = new AWS.S3();

/**
 * Navigate the given Puppeteer page to `url` and scrape entries from the DOM.
 *
 * @param {object} page - A Puppeteer Page instance.
 * @param {string} url - The URL to scrape.
 * @returns {Promise<object>} Object whose `entries` property holds the scraped rows.
 * @throws Rethrows any evaluation error after logging it; the caller owns the
 *         browser and is responsible for closing it.
 */
async function getData(page, url) {
    console.log("Get data for " + url);
    try {
        // NOTE(review): 500 ms is a very aggressive navigation timeout — the
        // .catch swallows the timeout so scraping proceeds on whatever loaded.
        await page.goto(url, { waitUntil: 'networkidle2', timeout: 500 }).catch(() => {
            console.log("Error caught");
        });

        // Runs inside the browser context: collect rows from the page.
        return await page.evaluate(() => {
            var entries = new Array();
            // FILL UP THE ENTRIES

            var results = new Object();
            results.entries = entries;

            return results;
        });
    } catch (error) {
        // The original called browser.close() here, but `browser` is not in
        // scope in this function and would raise a ReferenceError. Log and
        // rethrow; the handler's finally block closes the browser.
        console.log(error);
        throw error;
    }
}

exports.handler = async (event, context) => {
    console.log(`Going to start the handler`);

    const UserAgent = require('user-agents');
    const chromium = require('chrome-aws-lambda');

    let results = null;
    let browser = null;
    let items = new Array();

    try {
        const userAgent = new UserAgent({ deviceCategory: 'desktop' }).toString();

        // console.dir(chromium.args, null, true);

        browser = await chromium.puppeteer.launch({
            args: chromium.args,
            defaultViewport: chromium.defaultViewport,
            executablePath: await chromium.executablePath,
            headless: chromium.headless,

        const page = await browser.newPage();
        await page.setViewport({ width: 1900, height: 1024 });
        await intercept(page);

        results = await getData(page, event.url);
        items = items.concat(;
    } catch (error) {
    } finally {
        if (browser !== null) {
            await browser.close();

    console.log(`Finished extracting ${items.length} items`);
    var bucketName = process.env.AWSBucket;
    var keyName = 'PROJECT_NAME/data-file.json';

    var params = { Bucket: bucketName, Key: keyName, Body: JSON.stringify(items) };

    s3.putObject(params, function (err, data) {
        if (err)
            console.log("Successfully saved object to " + bucketName + "/" + keyName);

    return items;