How to download and unzip a zip file in memory in NodeJs?


Question

I want to download a zip file from the internet and unzip it in memory without saving to a temporary file. How can I do this?

Here is what I tried:

var url = 'http://bdn-ak.bloomberg.com/precanned/Comdty_Calendar_Spread_Option_20120428.txt.zip';

var request = require('request'), fs = require('fs'), zlib = require('zlib');

  request.get(url, function(err, res, file) {
     if(err) throw err;
     zlib.unzip(file, function(err, txt) {
        if(err) throw err;
        console.log(txt.toString()); //outputs nothing
     });
  });

[EDIT] As, suggested, I tried using the adm-zip library and I still cannot make this work:

var ZipEntry = require('adm-zip/zipEntry');
request.get(url, function(err, res, zipFile) {
        if(err) throw err;
        var zip = new ZipEntry();
        zip.setCompressedData(new Buffer(zipFile.toString('utf-8')));
        var text = zip.getData();
        console.log(text.toString()); // fails
    });
1
42
4/30/2012 8:21:42 PM

Accepted Answer

You need a library that can handle buffers. The latest version of adm-zip will do:

npm install adm-zip

My solution uses the http.get method, since it returns Buffer chunks.

Code:

var file_url = 'http://notepad-plus-plus.org/repository/7.x/7.6/npp.7.6.bin.x64.zip';

var AdmZip = require('adm-zip');
var http = require('http');

http.get(file_url, function(res) {
  var data = [], dataLen = 0; 

  res.on('data', function(chunk) {
    data.push(chunk);
    dataLen += chunk.length;

  }).on('end', function() {
    var buf = Buffer.alloc(dataLen);

    for (var i = 0, len = data.length, pos = 0; i < len; i++) { 
      data[i].copy(buf, pos); 
      pos += data[i].length; 
    } 

    var zip = new AdmZip(buf);
    var zipEntries = zip.getEntries();
    console.log(zipEntries.length)

    for (var i = 0; i < zipEntries.length; i++) {
      if (zipEntries[i].entryName.match(/readme/))
        console.log(zip.readAsText(zipEntries[i]));
    }
  });
});

The idea is to create an array of buffers and concatenate them into a new one at the end. This is due to the fact that buffers cannot be resized.

Update

This is a simpler solution that uses the request module to obtain the response in a buffer, by setting encoding: null in the options. It also follows redirects and resolves http/https automatically.

var file_url = 'https://github.com/mihaifm/linq/releases/download/3.1.1/linq.js-3.1.1.zip';

var AdmZip = require('adm-zip');
var request = require('request');

request.get({url: file_url, encoding: null}, (err, res, body) => {
  var zip = new AdmZip(body);
  var zipEntries = zip.getEntries();
  console.log(zipEntries.length);

  zipEntries.forEach((entry) => {
    if (entry.entryName.match(/readme/i))
      console.log(zip.readAsText(entry));
  });
});

The body of the response is a buffer that can be passed directly to AdmZip, simplifying the whole process.

67
11/22/2018 8:03:41 AM

Sadly you can't pipe the response stream into the unzip job as node zlib lib allows you to do, you have to cache and wait the end of the response. I suggest you to pipe the response to a fs stream in case of big files, otherwise you will full fill your memory in a blink!

I don't completely understand what you are trying to do, but imho this is the best approach. You should keep your data in memory only the time you really need it, and then stream to the csv parser.

If you want to keep all your data in memory you can replace the csv parser method fromPath with from that takes a buffer instead and in getData return directly unzipped

You can use the AMDZip (as @mihai said) instead of node-zip, just pay attention because AMDZip is not yet published in npm so you need:

$ npm install git://github.com/cthackers/adm-zip.git

N.B. Assumption: the zip file contains only one file

var request = require('request'),
    fs = require('fs'),
    csv = require('csv')
    NodeZip = require('node-zip')

function getData(tmpFolder, url, callback) {
  var tempZipFilePath = tmpFolder + new Date().getTime() + Math.random()
  var tempZipFileStream = fs.createWriteStream(tempZipFilePath)
  request.get({
    url: url,
    encoding: null
  }).on('end', function() {
    fs.readFile(tempZipFilePath, 'base64', function (err, zipContent) {
      var zip = new NodeZip(zipContent, { base64: true })
      Object.keys(zip.files).forEach(function (filename) {
        var tempFilePath = tmpFolder + new Date().getTime() + Math.random()
        var unzipped = zip.files[filename].data
        fs.writeFile(tempFilePath, unzipped, function (err) {
          callback(err, tempFilePath)
        })
      })
    })
  }).pipe(tempZipFileStream)
}

getData('/tmp/', 'http://bdn-ak.bloomberg.com/precanned/Comdty_Calendar_Spread_Option_20120428.txt.zip', function (err, path) {
  if (err) {
    return console.error('error: %s' + err.message)
  }
  var metadata = []
  csv().fromPath(path, {
    delimiter: '|',
    columns: true
  }).transform(function (data){
    // do things with your data
    if (data.NAME[0] === '#') {
      metadata.push(data.NAME)
    } else {
      return data
    }
  }).on('data', function (data, index) {
    console.log('#%d %s', index, JSON.stringify(data, null, '  '))
  }).on('end',function (count) {
    console.log('Metadata: %s', JSON.stringify(metadata, null, '  '))
    console.log('Number of lines: %d', count)
  }).on('error', function (error) {
    console.error('csv parsing error: %s', error.message)
  })
})

Licensed under: CC-BY-SA with attribution
Not affiliated with: Stack Overflow
Icon