Continuous integration systems generate an enormous amount of log data, making efficient log archiving essential. Node.js streams and tar libraries offer a powerful solution for processing and archiving logs in real-time, minimizing disk I/O and improving performance.

Dependencies

Before we begin, ensure you have Node.js >= 14.0.0 installed. Create a new project and install the required dependencies:

npm init -y
npm install tar-stream@3.1.7

Creating a real-time log archive

The following example demonstrates how to create an in-memory tar archive using streams. This approach is particularly useful in CI environments where logs are generated continuously:

const tar = require('tar-stream')
const { Readable } = require('stream')

// Appends one log file to an open tar pack stream.
// Wraps the callback-based pack.entry() API in a Promise so callers can await it;
// rejects with the entry error, resolves with undefined on success.
async function addLogToArchive(pack, filename, content) {
  return new Promise((resolve, reject) => {
    const onEntryWritten = (err) => (err ? reject(err) : resolve())
    pack.entry({ name: filename }, content, onEntryWritten)
  })
}

// Builds an in-memory tar archive from a list of { filename, content } logs.
// Entries are added sequentially; returns the finalized pack stream, ready to
// be piped to a destination. Throws a wrapping Error if any entry fails.
async function createLogArchive(logs) {
  const pack = tar.pack()

  // Log stream-level failures, but never throw from inside an event handler:
  // a throw here cannot be caught by the caller's try/catch — it becomes an
  // uncaught exception and crashes the process. Entry-level errors already
  // surface through addLogToArchive's rejection below.
  pack.on('error', (err) => {
    console.error('Tar pack error:', err)
  })

  try {
    for (const log of logs) {
      await addLogToArchive(pack, log.filename, log.content)
    }
    pack.finalize()
    return pack
  } catch (error) {
    throw new Error(`Failed to create archive: ${error.message}`)
  }
}

// Example usage with error handling
// Builds two sample CI log entries (a build log and a test log) and archives
// them, returning the resulting tar stream; logs and rethrows on failure.
async function archiveLogs() {
  const buildLog = {
    filename: 'build.log',
    content: [
      `Build started at ${new Date().toISOString()}`,
      'Installing dependencies...',
      'Build completed successfully.',
    ].join('\n'),
  }

  const testLog = {
    filename: 'test.log',
    content: [
      'Test suite initiated.',
      'All tests passed without errors.',
      'Execution time: 12s.',
    ].join('\n'),
  }

  try {
    return await createLogArchive([buildLog, testLog])
  } catch (error) {
    console.error('Failed to create archive:', error)
    throw error
  }
}

Performance considerations

When working with tar archives in Node.js, keep these performance optimizations in mind:

  • Use streams for large files to minimize memory usage
  • Consider using worker threads for compression in high-throughput scenarios
  • Implement backpressure handling for large datasets
  • Use appropriate buffer sizes when dealing with binary data

Security considerations

When working with tar archives, implement these security measures:

const path = require('path')

// Validates an archive entry name against path-traversal attacks.
// Returns the normalized path on success; throws on absolute paths or any
// path that escapes the archive root.
function validateFilename(filename) {
  const normalizedPath = path.normalize(filename)
  // Reject absolute paths and anything that climbs out of the root.
  // Checking a bare '..' plus '..' followed by a separator covers inputs
  // like '..', '../x' and, on Windows, '..\\x' — the original
  // startsWith('../') test missed the bare '..' and backslash cases.
  if (
    path.isAbsolute(normalizedPath) ||
    normalizedPath === '..' ||
    normalizedPath.startsWith(`..${path.sep}`) ||
    normalizedPath.startsWith('../')
  ) {
    throw new Error('Invalid filename: Path traversal attempt detected')
  }
  return normalizedPath
}

// Appends one log file to the tar pack after validating its name against
// path traversal. Throws synchronously on an invalid name; otherwise resolves
// once the entry has been written, or rejects with the entry error.
async function addLogToArchive(pack, filename, content) {
  const safeFilename = validateFilename(filename)
  return new Promise((resolve, reject) => {
    const onEntryWritten = (err) => (err ? reject(err) : resolve())
    pack.entry({ name: safeFilename }, content, onEntryWritten)
  })
}

Handling different file types

When archiving various file types, you'll need to handle them appropriately:

// Adds an arbitrary file (text or binary) to the tar pack.
// Strings are converted to a Buffer using `encoding` (default utf8);
// Buffers pass through untouched, so binary payloads are preserved.
async function addFileToArchive(pack, filename, content, encoding = 'utf8') {
  // Shell scripts get an executable mode bit; everything else is a plain file.
  const mode = filename.endsWith('.sh') ? 0o755 : 0o644

  // Normalize string input to a Buffer so text and binary take the same path.
  const payload = Buffer.isBuffer(content) ? content : Buffer.from(content, encoding)

  return new Promise((resolve, reject) => {
    pack.entry({ name: filename, mode }, payload, (err) => {
      if (err) {
        reject(err)
      } else {
        resolve()
      }
    })
  })
}

Troubleshooting common issues

Here are solutions to common challenges when working with tar streams:

  1. Memory leaks:
const stream = require('stream')
const { promisify } = require('util')
const pipeline = promisify(stream.pipeline)

// Pipes the archive into the destination using pipeline(), which destroys
// both streams on failure and so avoids dangling file descriptors / leaks.
// Logs and rethrows any stream error so the caller can handle it.
async function handleArchiveStream(archiveStream, outputStream) {
  try {
    await pipeline(archiveStream, outputStream)
  } catch (streamError) {
    console.error('Stream processing failed:', streamError)
    throw streamError
  }
}
  2. Incomplete archives:
// Creates the archive but rejects with a timeout error if it takes longer
// than timeoutMs (default 30s), preventing a stuck build from hanging CI.
function createArchiveWithTimeout(logs, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const timeout = setTimeout(() => {
      reject(new Error('Archive creation timed out'))
    }, timeoutMs)

    // Clear the timer on BOTH outcomes: the original only cleared it on
    // success, so a failed archive left a live timer holding the event loop
    // open for up to timeoutMs. (A late extra reject after timeout is a no-op.)
    createLogArchive(logs)
      .then(resolve, reject)
      .finally(() => clearTimeout(timeout))
  })
}

Conclusion

Node.js streams combined with tar libraries provide an efficient solution for real-time log archiving in CI pipelines. This approach minimizes disk I/O and enhances performance while maintaining flexibility for various use cases. By implementing proper error handling and security measures, you can build robust and reliable archiving systems for your development workflow.

Happy coding!