diff --git a/guix/store/deduplication.scm b/guix/store/deduplication.scm index 129574c073..2005653c95 100644 --- a/guix/store/deduplication.scm +++ b/guix/store/deduplication.scm @@ -1,6 +1,6 @@ ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2017 Caleb Ristvedt -;;; Copyright © 2018-2022 Ludovic Courtès +;;; Copyright © 2018-2022, 2024 Ludovic Courtès ;;; ;;; This file is part of GNU Guix. ;;; @@ -28,6 +28,7 @@ #:use-module (srfi srfi-11) #:use-module (srfi srfi-34) #:use-module (srfi srfi-35) + #:use-module (rnrs bytevectors) #:use-module (rnrs io ports) #:use-module (ice-9 match) #:use-module (guix serialization) @@ -206,6 +207,48 @@ under STORE." #f) (else (apply throw args))))))))))) +(define (hole-size bv start size) + "Return a lower bound of the number of leading zeros in the first SIZE bytes +of BV, starting at offset START." + (let ((end (+ start size))) + (let loop ((offset start)) + (if (> offset (- end 4)) + (- offset start) + (if (zero? (bytevector-u32-native-ref bv offset)) + (loop (+ offset 4)) + (- offset start)))))) + +(define (find-holes bv start size) + "Return the list of offset/size pairs representing \"holes\" (sequences of +zeros) in the SIZE bytes starting at START in BV." + (define granularity + ;; Disk block size is traditionally 512 bytes; focus on larger holes to + ;; reduce the computational effort. + 1024) + + (define (align offset) + (match (modulo offset granularity) + (0 offset) + (mod (+ offset (- granularity mod))))) + + (define end + (+ start size)) + + (let loop ((offset start) + (size size) + (holes '())) + (if (>= offset end) + (reverse! holes) + (let ((hole (hole-size bv offset size))) + (if (and hole (>= hole granularity)) + (let ((next (align (+ offset hole)))) + (loop next + (- size (- next offset)) + (cons (cons offset hole) holes))) + (loop (+ offset granularity) + (- size granularity) + holes)))))) + (define (tee input len output) "Return a port that reads up to LEN bytes from INPUT and writes them to OUTPUT as it goes." @@ -217,6 +260,10 @@ OUTPUT as it goes." (&nar-error (port input) (file (port-filename output)))))) + (define seekable? + ;; Whether OUTPUT can be a sparse file. + (file-port? output)) + (define (read! bv start count) ;; Read at most LEN bytes in total. (let ((count (min count (- len bytes-read)))) @@ -229,7 +276,35 @@ OUTPUT as it goes." ;; Do not return zero since zero means EOF, so try again. (loop (get-bytevector-n! input bv start count))) (else - (put-bytevector output bv start ret) + (if seekable? + ;; Render long-enough sequences of zeros as "holes". + (match (find-holes bv start ret) + (() + (put-bytevector output bv start ret)) + (holes + (let loop ((offset start) + (size ret) + (holes holes)) + (match holes + (() + (if (> size 0) + (put-bytevector output bv offset size) + (when (= len (+ bytes-read ret)) + ;; We created a hole in OUTPUT by seeking + ;; forward but that hole only comes into + ;; existence if we write something after it. + ;; Make the hole one byte smaller and write a + ;; final zero. + (seek output -1 SEEK_CUR) + (put-u8 output 0)))) + (((hole-start . hole-size) . rest) + (let ((prefix-len (- hole-start offset))) + (put-bytevector output bv offset prefix-len) + (seek output hole-size SEEK_CUR) + (loop (+ hole-start hole-size) + (- size prefix-len hole-size) + rest))))))) + (put-bytevector output bv start ret)) (set! bytes-read (+ bytes-read ret)) ret))))) diff --git a/tests/store-deduplication.scm b/tests/store-deduplication.scm index f1845035d8..f116ff9834 100644 --- a/tests/store-deduplication.scm +++ b/tests/store-deduplication.scm @@ -1,5 +1,5 @@ ;;; GNU Guix --- Functional package management for GNU -;;; Copyright © 2018, 2020-2022 Ludovic Courtès +;;; Copyright © 2018, 2020-2022, 2024 Ludovic Courtès ;;; ;;; This file is part of GNU Guix. ;;; @@ -24,10 +24,27 @@ #:use-module (guix build utils) #:use-module (rnrs bytevectors) #:use-module (ice-9 binary-ports) + #:use-module (ice-9 match) #:use-module (srfi srfi-1) #:use-module (srfi srfi-26) #:use-module (srfi srfi-64)) +(define (cartesian-product . lst) + "Return the Cartesian product of all the given lists." + (match lst + ((head) + (map list head)) + ((head . rest) + (let ((others (apply cartesian-product rest))) + (append-map (lambda (init) + (map (lambda (lst) + (cons init lst)) + others)) + head))) + (() + '()))) + + (test-begin "store-deduplication") (test-equal "deduplicate, below %deduplication-minimum-size" @@ -166,4 +183,43 @@ (cut string-append store <>)) '("/a" "/b" "/c")))))))) +(for-each (match-lambda + ((initial-gap middle-gap final-gap) + (test-assert + (format #f "copy-file/deduplicate, sparse files (holes: ~a/~a/~a)" + initial-gap middle-gap final-gap) + (call-with-temporary-directory + (lambda (store) + (let ((source (string-append store "/source"))) + (call-with-output-file source + (lambda (port) + (seek port initial-gap SEEK_CUR) + (display "hi!" port) + (seek port middle-gap SEEK_CUR) + (display "bye." port) + (when (> final-gap 0) + (seek port (- final-gap 1) SEEK_CUR) + (put-u8 port 0)))) + + (for-each (lambda (target) + (copy-file/deduplicate source + (string-append store target) + #:store store)) + '("/a" "/b" "/c")) + (system* "du" "-h" source) + (system* "du" "-h" "--apparent-size" source) + (system* "du" "-h" (string-append store "/a")) + (system* "du" "-h" "--apparent-size" (string-append store "/a")) + (and (directory-exists? (string-append store "/.links")) + (file=? source (string-append store "/a")) + (apply = (map (compose stat:ino stat + (cut string-append store <>)) + '("/a" "/b" "/c"))) + (let ((st (pk 'S (stat (string-append store "/a"))))) + (<= (* 512 (stat:blocks st)) + (stat:size st)))))))))) + (cartesian-product '(0 3333 8192) + '(8192 9999 16384 22222) + '(0 8192))) + (test-end "store-deduplication")