Duplicate files can quickly become a headache in modern applications—eating up storage and slowing queries. In this DevTip we will build a small Node.js server that rejects duplicate uploads by calculating a SHA-256 hash for every incoming file and comparing it against previously seen ones.

Why not MD5?

MD5 used to be the go-to algorithm for quick integrity checks, but it is no longer considered collision-resistant. For a deduplication system you usually do not need full cryptographic strength, yet picking a broken hash means accepting a small but real risk of two different files producing the same digest. SHA-256 is still fast, ships with Node.js, and eliminates that concern, so that is what we will use throughout the article.

Understand content-based deduplication

Content-based deduplication, a form of content addressing, identifies identical files by their binary content instead of their filenames or metadata. The workflow is simple:

  1. Receive an upload.
  2. Stream the bytes through a hash function.
  3. Look up that hash in a datastore.
  4. Reject, reuse, or store the file depending on whether the hash already exists.

Because hashes have a fixed length (256 bits, or 64 hex characters, for SHA-256), you can use them as lightweight primary keys regardless of the original file size.
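
To make that concrete, here is a tiny stand-alone snippet: hashing a five-byte string and a fifty-megabyte buffer both produce a 64-character hex string.

const crypto = require('crypto')

// The digest length depends only on the algorithm, never on the input size.
const small = crypto.createHash('sha256').update('hello').digest('hex')
const large = crypto.createHash('sha256').update(Buffer.alloc(50 * 1024 * 1024)).digest('hex')

console.log(small.length, large.length) // 64 64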

Set up the project

mkdir file-deduplication && cd $_
npm init -y
npm install express@4.19.2 multer@1.4.5-lts.1 sqlite3@5.1.7

We will keep extra dependencies to a minimum: express for routing, multer for multipart parsing, and sqlite3 for a tiny persistent database. The path, fs, and crypto modules ship with Node.js.

Create a secure upload handler

// index.js
const express = require('express')
const multer = require('multer')
const path = require('path')
const fs = require('fs')
const crypto = require('crypto')
const sqlite3 = require('sqlite3').verbose()

const app = express()

/* -------------------------------------------------------------------------- */
/* Database                                                                   */
/* -------------------------------------------------------------------------- */
const db = new sqlite3.Database('deduplication.db')

db.exec(`CREATE TABLE IF NOT EXISTS files (
  hash TEXT PRIMARY KEY,
  original_name TEXT,
  filename TEXT,
  file_path TEXT,
  size INTEGER,
  upload_date TEXT
)`)

/* -------------------------------------------------------------------------- */
/* Multer configuration                                                       */
/* -------------------------------------------------------------------------- */
const storage = multer.diskStorage({
  destination(req, file, cb) {
    const dir = 'uploads'
    fs.mkdirSync(dir, { recursive: true })
    cb(null, dir)
  },
  filename(req, file, cb) {
    const unique = Date.now() + '-' + Math.round(Math.random() * 1e9)
    cb(null, `${file.fieldname}-${unique}${path.extname(file.originalname)}`)
  },
})

const MAX_SIZE = 10 * 1024 * 1024 // 10 MB

const upload = multer({
  storage,
  limits: { fileSize: MAX_SIZE, files: 1 },
  fileFilter(req, file, cb) {
    const allowed = ['image/jpeg', 'image/png', 'image/gif', 'application/pdf']
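    // Returning false silently skips the file, so the route answers "No file uploaded" for disallowed types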
    cb(null, allowed.includes(file.mimetype))
  },
})

/* -------------------------------------------------------------------------- */
/* Helper: stream hashing                                                     */
/* -------------------------------------------------------------------------- */
function hashFile(filePath) {
  return new Promise((resolve, reject) => {
    const hash = crypto.createHash('sha256')
    const stream = fs.createReadStream(filePath)

    stream.on('data', (chunk) => hash.update(chunk))
    stream.on('end', () => resolve(hash.digest('hex')))
    stream.on('error', reject)
  })
}

/* -------------------------------------------------------------------------- */
/* Routes                                                                     */
/* -------------------------------------------------------------------------- */
app.post('/upload', (req, res) => {
  upload.single('file')(req, res, async (err) => {
    if (err instanceof multer.MulterError) {
      return res.status(400).json({ error: err.message })
    }
    if (err) {
      return res.status(500).json({ error: err.message })
    }
    if (!req.file) {
      return res.status(400).json({ error: 'No file uploaded' })
    }

    const { path: filePath, originalname, filename, size } = req.file

    try {
      const digest = await hashFile(filePath)

      db.get('SELECT * FROM files WHERE hash = ?', [digest], (dbErr, row) => {
        if (dbErr) {
          fs.unlinkSync(filePath)
          return res.status(500).json({ error: dbErr.message })
        }

        if (row) {
          // Duplicate: remove new file and return existing info
          fs.unlinkSync(filePath)
          return res.status(409).json({
            error: 'Duplicate file detected',
            hash: digest,
            originalName: row.original_name,
          })
        }

        // Store metadata for future checks
        db.run(
          `INSERT INTO files (hash, original_name, filename, file_path, size, upload_date)
           VALUES (?, ?, ?, ?, ?, datetime('now'))`,
          [digest, originalname, filename, filePath, size],
          (insertErr) => {
            if (insertErr) {
              fs.unlinkSync(filePath)
              return res.status(500).json({ error: insertErr.message })
            }

            return res.json({
              message: 'File uploaded successfully',
              hash: digest,
              size,
            })
          },
        )
      })
    } catch (hashErr) {
      fs.unlinkSync(filePath)
      res.status(500).json({ error: hashErr.message })
    }
  })
})

app.get('/files', (req, res) => {
  db.all('SELECT original_name as name, hash, size, upload_date FROM files', (err, rows) => {
    if (err) return res.status(500).json({ error: err.message })
    res.json(rows)
  })
})

/* -------------------------------------------------------------------------- */
/* Start server                                                               */
/* -------------------------------------------------------------------------- */
const PORT = process.env.PORT || 3000
app.listen(PORT, () => {
  console.log(`Server listening on http://localhost:${PORT}`)
})
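
With the server running (node index.js), you can exercise the endpoint without any HTML at all. The sketch below assumes Node.js 18 or newer (global fetch, FormData, and Blob) and a local photo.jpg to upload; the second upload of the same file should come back as 409 Duplicate.

// test-upload.js – minimal client sketch; photo.jpg is a placeholder for any allowed file
const fs = require('fs')

async function uploadOnce() {
  const form = new FormData()
  form.append(
    'file',
    new Blob([fs.readFileSync('photo.jpg')], { type: 'image/jpeg' }),
    'photo.jpg',
  )

  const res = await fetch('http://localhost:3000/upload', { method: 'POST', body: form })
  console.log(res.status, await res.json())
}

// Upload the same bytes twice: expect 200 first, then 409
uploadOnce().then(uploadOnce)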

Handle large files efficiently

hashFile already streams data, so memory usage stays constant regardless of file size. When you expect gigabyte-scale uploads, move the hashing and database work into a background queue instead: each chunk is still hashed on the main thread, so very large files add noticeable CPU time to the event loop, and the HTTP response stays open until SQLite has written the metadata. Tools like BullMQ or NestJS's queue module make that straightforward, as the sketch below shows.
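
A minimal sketch of that idea with BullMQ (npm install bullmq), assuming a Redis instance on localhost:6379 and assuming the hashFile helper has been moved into its own module (./hash-file.js here) so both processes can require it:

// queue.js – hedged sketch, not a drop-in replacement for the inline hashing
const { Queue, Worker } = require('bullmq')
const { hashFile } = require('./hash-file') // the same streaming helper shown above

const connection = { host: '127.0.0.1', port: 6379 }

// The upload route only enqueues a job and responds immediately, for example:
//   await hashQueue.add('hash-upload', { filePath, originalname, filename, size })
const hashQueue = new Queue('file-hashing', { connection })

// A separate worker process streams the file, hashes it, and updates the database
new Worker(
  'file-hashing',
  async (job) => {
    const digest = await hashFile(job.data.filePath)
    // ...look up the digest and insert or delete exactly as in the inline version
    return digest
  },
  { connection },
)

module.exports = { hashQueue }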

Performance, storage, and security tips

  • Add an index on the hash column for faster lookups when you migrate to PostgreSQL or another full-fledged database (in our SQLite schema the hash is already indexed because it is the primary key).
  • Keep your upload directory outside the web root and back it up separately from your metadata database.
  • Cache recently used hashes in Redis when your workload involves many near-identical files (for example user avatars).
  • Sanitize filenames to prevent path traversal vulnerabilities if you use user-provided names directly in file system operations (though our example uses a generated filename); see the sketch after this list.
  • Regularly update dependencies to patch known vulnerabilities.
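
If you ever need to honor a user-supplied name (for example when offering downloads under the original filename), a small sanitizer goes a long way. The sketch below only uses the built-in path module:

// Hedged sketch: strip directory components and unusual characters from a user-supplied name
const path = require('path')

function safeFilename(original) {
  const base = path.basename(original)           // drops any ../ or absolute-path prefix
  return base.replace(/[^a-zA-Z0-9._-]/g, '_')   // keep a conservative character set
}

console.log(safeFilename('../../etc/passwd'))      // passwd
console.log(safeFilename('holiday photo (1).png')) // holiday_photo__1_.png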

Demo interface (optional)

For brevity we skipped the HTML demo here, but you can keep the one from the previous version of this post—it will still work with the new backend.

Wrap-up

A hash-based approach to deduplication is compact, accurate, and easy to bolt onto any existing upload endpoint. By switching to SHA-256 and streaming file reads you get collision resistance and constant memory usage with only a handful of lines of code.

If you would rather offload hashing entirely, our 🤖 /file/hash Robot in the Media Cataloging service can generate SHA-256 (and many other) checksums for you—no server maintenance required.