grafts: Support rewriting UTF-16 and UTF-32 store references.
Partially fixes <https://bugs.gnu.org/33848>. * guix/build/graft.scm (replace-store-references): Add support for finding and rewriting UTF-16 and UTF-32 store references. * tests/grafts.scm: Add tests.master
parent
abf032c131
commit
1bab9b9f17
|
@ -1,6 +1,6 @@
|
||||||
;;; GNU Guix --- Functional package management for GNU
|
;;; GNU Guix --- Functional package management for GNU
|
||||||
;;; Copyright © 2014, 2015, 2016, 2018 Ludovic Courtès <ludo@gnu.org>
|
;;; Copyright © 2014, 2015, 2016, 2018 Ludovic Courtès <ludo@gnu.org>
|
||||||
;;; Copyright © 2016 Mark H Weaver <mhw@netris.org>
|
;;; Copyright © 2016, 2021 Mark H Weaver <mhw@netris.org>
|
||||||
;;;
|
;;;
|
||||||
;;; This file is part of GNU Guix.
|
;;; This file is part of GNU Guix.
|
||||||
;;;
|
;;;
|
||||||
|
@ -55,6 +55,52 @@
|
||||||
(string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
|
(string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
|
||||||
<>))
|
<>))
|
||||||
|
|
||||||
|
(define (nix-base32-char-or-nul? c)
|
||||||
|
"Return true if C is a nix-base32 character or NUL, otherwise return false."
|
||||||
|
(or (nix-base32-char? c)
|
||||||
|
(char=? c #\nul)))
|
||||||
|
|
||||||
|
(define (possible-utf16-hash? buffer i w)
|
||||||
|
"Return true if (I - W) is large enough to hold a UTF-16 encoded
|
||||||
|
nix-base32 hash and if BUFFER contains NULs in all positions where NULs
|
||||||
|
are to be expected in a UTF-16 encoded hash+dash pattern whose dash is
|
||||||
|
found at position I. Otherwise, return false."
|
||||||
|
(and (<= (* 2 hash-length) (- i w))
|
||||||
|
(let loop ((j (+ 1 (- i (* 2 hash-length)))))
|
||||||
|
(or (>= j i)
|
||||||
|
(and (zero? (bytevector-u8-ref buffer j))
|
||||||
|
(loop (+ j 2)))))))
|
||||||
|
|
||||||
|
(define (possible-utf32-hash? buffer i w)
|
||||||
|
"Return true if (I - W) is large enough to hold a UTF-32 encoded
|
||||||
|
nix-base32 hash and if BUFFER contains NULs in all positions where NULs
|
||||||
|
are to be expected in a UTF-32 encoded hash+dash pattern whose dash is
|
||||||
|
found at position I. Otherwise, return false."
|
||||||
|
(and (<= (* 4 hash-length) (- i w))
|
||||||
|
(let loop ((j (+ 1 (- i (* 4 hash-length)))))
|
||||||
|
(or (>= j i)
|
||||||
|
(and (zero? (bytevector-u8-ref buffer j))
|
||||||
|
(zero? (bytevector-u8-ref buffer (+ j 1)))
|
||||||
|
(zero? (bytevector-u8-ref buffer (+ j 2)))
|
||||||
|
(loop (+ j 4)))))))
|
||||||
|
|
||||||
|
(define (insert-nuls char-size bv)
|
||||||
|
"Given a bytevector BV, return a bytevector containing the same bytes but
|
||||||
|
with (CHAR-SIZE - 1) NULs inserted between every two adjacent bytes from BV.
|
||||||
|
For example, (insert-nuls 4 #u8(1 2 3)) => #u8(1 0 0 0 2 0 0 0 3)."
|
||||||
|
(if (= char-size 1)
|
||||||
|
bv
|
||||||
|
(let* ((len (bytevector-length bv))
|
||||||
|
(bv* (make-bytevector (+ 1 (* char-size
|
||||||
|
(- len 1)))
|
||||||
|
0)))
|
||||||
|
(let loop ((i 0))
|
||||||
|
(when (< i len)
|
||||||
|
(bytevector-u8-set! bv* (* i char-size)
|
||||||
|
(bytevector-u8-ref bv i))
|
||||||
|
(loop (+ i 1))))
|
||||||
|
bv*)))
|
||||||
|
|
||||||
(define* (replace-store-references input output replacement-table
|
(define* (replace-store-references input output replacement-table
|
||||||
#:optional (store (%store-directory)))
|
#:optional (store (%store-directory)))
|
||||||
"Read data from INPUT, replacing store references according to
|
"Read data from INPUT, replacing store references according to
|
||||||
|
@ -76,9 +122,9 @@ bytevectors to the same value."
|
||||||
(list->vector (map pred (iota 256)))
|
(list->vector (map pred (iota 256)))
|
||||||
<>))
|
<>))
|
||||||
|
|
||||||
(define nix-base32-byte?
|
(define nix-base32-byte-or-nul?
|
||||||
(optimize-u8-predicate
|
(optimize-u8-predicate
|
||||||
(compose nix-base32-char?
|
(compose nix-base32-char-or-nul?
|
||||||
integer->char)))
|
integer->char)))
|
||||||
|
|
||||||
(define (dash? byte) (= byte 45))
|
(define (dash? byte) (= byte 45))
|
||||||
|
@ -86,100 +132,153 @@ bytevectors to the same value."
|
||||||
(define request-size (expt 2 20)) ; 1 MiB
|
(define request-size (expt 2 20)) ; 1 MiB
|
||||||
|
|
||||||
;; We scan the file for the following 33-byte pattern: 32 bytes of
|
;; We scan the file for the following 33-byte pattern: 32 bytes of
|
||||||
;; nix-base32 characters followed by a dash. To accommodate large files,
|
;; nix-base32 characters followed by a dash. When we find such a pattern
|
||||||
;; we do not read the entire file, but instead work on buffers of up to
|
;; whose hash is in REPLACEMENT-TABLE, we perform the required rewrite and
|
||||||
;; 'request-size' bytes. To ensure that every 33-byte sequence appears
|
;; continue scanning.
|
||||||
;; entirely within exactly one buffer, adjacent buffers must overlap,
|
;;
|
||||||
;; i.e. they must share 32 byte positions. We accomplish this by
|
;; To support UTF-16 and UTF-32 store references, the 33 bytes comprising
|
||||||
;; "ungetting" the last 32 bytes of each buffer before reading the next
|
;; this hash+dash pattern may optionally be interspersed by extra NUL bytes.
|
||||||
;; buffer, unless we know that we've reached the end-of-file.
|
;; This simple approach works because the characters we are looking for are
|
||||||
|
;; restricted to ASCII. UTF-16 hashes are interspersed with single NUL
|
||||||
|
;; bytes ("\0"), and UTF-32 hashes are interspersed with triplets of NULs
|
||||||
|
;; ("\0\0\0"). Note that we require NULs to be present only *between* the
|
||||||
|
;; other bytes, and not at either end, in order to be insensitive to byte
|
||||||
|
;; order.
|
||||||
|
;;
|
||||||
|
;; To accommodate large files, we do not read the entire file at once, but
|
||||||
|
;; instead work on buffers of up to REQUEST-SIZE bytes. To ensure that
|
||||||
|
;; every hash+dash pattern appears in its entirety in at least one buffer,
|
||||||
|
;; adjacent buffers must overlap by one byte less than the maximum size of a
|
||||||
|
;; hash+dash pattern. We accomplish this by "ungetting" a suffix of each
|
||||||
|
;; buffer before reading the next buffer, unless we know that we've reached
|
||||||
|
;; the end-of-file.
|
||||||
(let ((buffer (make-bytevector request-size)))
|
(let ((buffer (make-bytevector request-size)))
|
||||||
(let loop ()
|
(define-syntax-rule (byte-at i)
|
||||||
;; Note: We avoid 'get-bytevector-n' to work around
|
(bytevector-u8-ref buffer i))
|
||||||
;; <http://bugs.gnu.org/17466>.
|
(let outer-loop ()
|
||||||
(match (get-bytevector-n! input buffer 0 request-size)
|
(match (get-bytevector-n! input buffer 0 request-size)
|
||||||
((? eof-object?) 'done)
|
((? eof-object?) 'done)
|
||||||
(end
|
(end
|
||||||
;; We scan the buffer for dashes that might be preceded by a
|
(define (scan-from i w)
|
||||||
;; nix-base32 hash. The key optimization here is that whenever we
|
;; Scan the buffer for dashes that might be preceded by nix hashes,
|
||||||
;; find a NON-nix-base32 character at position 'i', we know that it
|
;; where I is the minimum position where such a dash might be
|
||||||
;; cannot be part of a hash, so the earliest position where the next
|
;; found, and W is the number of bytes in the buffer that have been
|
||||||
;; hash could start is i+1 with the following dash at position i+33.
|
;; written so far. We assume that I - W >= HASH-LENGTH.
|
||||||
;;
|
;;
|
||||||
;; Since nix-base32 characters comprise only 1/8 of the 256 possible
|
;; The key optimization here is that whenever we find a byte at
|
||||||
;; byte values, and exclude some of the most common letters in
|
;; position I that cannot occur within a nix hash (because it's
|
||||||
;; English text (e t o u), in practice we can advance by 33 positions
|
;; neither a nix-base32 character nor NUL), we can infer that the
|
||||||
;; most of the time.
|
;; earliest position where the next hash could start is at I + 1,
|
||||||
(let scan-from ((i hash-length) (written 0))
|
;; and therefore the earliest position for the following dash is
|
||||||
;; 'i' is the first position where we look for a dash. 'written'
|
;; (+ I 1 HASH-LENGTH), which is I + 33.
|
||||||
;; is the number of bytes in the buffer that have already been
|
;;
|
||||||
;; written.
|
;; Since nix-base32-or-nul characters comprise only about 1/8 of
|
||||||
|
;; the 256 possible byte values, and exclude some of the most
|
||||||
|
;; common letters in English text (e t o u), we can advance 33
|
||||||
|
;; positions much of the time.
|
||||||
(if (< i end)
|
(if (< i end)
|
||||||
(let ((byte (bytevector-u8-ref buffer i)))
|
(let ((byte (byte-at i)))
|
||||||
(cond ((and (dash? byte)
|
(cond ((dash? byte)
|
||||||
;; We've found a dash. Note that we do not know
|
(found-dash i w))
|
||||||
;; whether the preceeding 32 bytes are nix-base32
|
((nix-base32-byte-or-nul? byte)
|
||||||
;; characters, but we do not need to know. If
|
(scan-from (+ i 1) w))
|
||||||
;; they are not, the following lookup will fail.
|
(else
|
||||||
(lookup-replacement
|
(not-part-of-hash i w))))
|
||||||
(string-tabulate (lambda (j)
|
(finish-buffer i w)))
|
||||||
|
|
||||||
|
(define (not-part-of-hash i w)
|
||||||
|
;; Position I is known to not be within a nix hash that we must
|
||||||
|
;; rewrite. Therefore, the earliest position where the next hash
|
||||||
|
;; might start is I + 1, and therefore the earliest position of
|
||||||
|
;; the following dash is (+ I 1 HASH-LENGTH).
|
||||||
|
(scan-from (+ i 1 hash-length) w))
|
||||||
|
|
||||||
|
(define (found-dash i w)
|
||||||
|
;; We know that there is a dash '-' at position I, and that
|
||||||
|
;; I - W >= HASH-LENGTH. The immediately preceding bytes *might*
|
||||||
|
;; contain a nix-base32 hash, but that is not yet known. Here,
|
||||||
|
;; we rule out all but one possible encoding (ASCII, UTF-16,
|
||||||
|
;; UTF-32) by counting how many NULs precede the dash.
|
||||||
|
(cond ((not (zero? (byte-at (- i 1))))
|
||||||
|
;; The dash is *not* preceded by a NUL, therefore it
|
||||||
|
;; cannot possibly be a UTF-16 or UTF-32 hash. Proceed
|
||||||
|
;; to check for an ASCII hash.
|
||||||
|
(found-possible-hash 1 i w))
|
||||||
|
|
||||||
|
((not (zero? (byte-at (- i 2))))
|
||||||
|
;; The dash is preceded by exactly one NUL, therefore it
|
||||||
|
;; cannot be an ASCII or UTF-32 hash. Proceed to check
|
||||||
|
;; for a UTF-16 hash.
|
||||||
|
(if (possible-utf16-hash? buffer i w)
|
||||||
|
(found-possible-hash 2 i w)
|
||||||
|
(not-part-of-hash i w)))
|
||||||
|
|
||||||
|
(else
|
||||||
|
;; The dash is preceded by at least two NULs, therefore
|
||||||
|
;; it cannot be an ASCII or UTF-16 hash. Proceed to
|
||||||
|
;; check for a UTF-32 hash.
|
||||||
|
(if (possible-utf32-hash? buffer i w)
|
||||||
|
(found-possible-hash 4 i w)
|
||||||
|
(not-part-of-hash i w)))))
|
||||||
|
|
||||||
|
(define (found-possible-hash char-size i w)
|
||||||
|
;; We know that there is a dash '-' at position I, that
|
||||||
|
;; I - W >= CHAR-SIZE * HASH-LENGTH, and that the only
|
||||||
|
;; possible encoding for the preceding hash is as indicated by
|
||||||
|
;; CHAR-SIZE. Here we check to see if the given hash is in
|
||||||
|
;; REPLACEMENT-TABLE, and if so, we perform the required
|
||||||
|
;; rewrite.
|
||||||
|
(let* ((hash (string-tabulate
|
||||||
|
(lambda (j)
|
||||||
(integer->char
|
(integer->char
|
||||||
(bytevector-u8-ref buffer
|
(byte-at (- i (* char-size
|
||||||
(+ j (- i hash-length)))))
|
(- hash-length j))))))
|
||||||
hash-length)))
|
hash-length))
|
||||||
=> (lambda (replacement)
|
(replacement* (lookup-replacement hash))
|
||||||
|
(replacement (and replacement*
|
||||||
|
(insert-nuls char-size replacement*))))
|
||||||
|
(cond
|
||||||
|
((not replacement)
|
||||||
|
(not-part-of-hash i w))
|
||||||
|
(else
|
||||||
;; We've found a hash that needs to be replaced.
|
;; We've found a hash that needs to be replaced.
|
||||||
;; First, write out all bytes preceding the hash
|
;; First, write out all bytes preceding the hash
|
||||||
;; that have not yet been written.
|
;; that have not yet been written.
|
||||||
(put-bytevector output buffer written
|
(put-bytevector output buffer w
|
||||||
(- i hash-length written))
|
(- i (* char-size hash-length) w))
|
||||||
;; Now write the replacement string.
|
;; Now write the replacement string.
|
||||||
(put-bytevector output replacement)
|
(put-bytevector output replacement)
|
||||||
;; Since the byte at position 'i' is a dash,
|
;; Now compute the new values of W and I and continue.
|
||||||
;; which is not a nix-base32 char, the earliest
|
(let ((w (+ (- i (* char-size hash-length))
|
||||||
;; position where the next hash might start is
|
(bytevector-length replacement))))
|
||||||
;; i+1, and the earliest position where the
|
(scan-from (+ w hash-length) w))))))
|
||||||
;; following dash might start is (+ i 1
|
|
||||||
;; hash-length). Also, increase the write
|
|
||||||
;; position to account for REPLACEMENT.
|
|
||||||
(let ((len (bytevector-length replacement)))
|
|
||||||
(scan-from (+ i 1 len)
|
|
||||||
(+ i (- len hash-length))))))
|
|
||||||
;; If the byte at position 'i' is a nix-base32 char,
|
|
||||||
;; then the dash we're looking for might be as early as
|
|
||||||
;; the following byte, so we can only advance by 1.
|
|
||||||
((nix-base32-byte? byte)
|
|
||||||
(scan-from (+ i 1) written))
|
|
||||||
;; If the byte at position 'i' is NOT a nix-base32
|
|
||||||
;; char, then the earliest position where the next hash
|
|
||||||
;; might start is i+1, with the following dash at
|
|
||||||
;; position (+ i 1 hash-length).
|
|
||||||
(else
|
|
||||||
(scan-from (+ i 1 hash-length) written))))
|
|
||||||
|
|
||||||
;; We have finished scanning the buffer. Now we determine how
|
(define (finish-buffer i w)
|
||||||
;; many bytes have not yet been written, and how many bytes to
|
;; We have finished scanning the buffer. Now we determine how many
|
||||||
;; "unget". If 'end' is less than 'request-size' then we read
|
;; bytes have not yet been written, and how many bytes to "unget".
|
||||||
;; less than we asked for, which indicates that we are at EOF,
|
;; If END is less than REQUEST-SIZE then we read less than we asked
|
||||||
;; so we needn't unget anything. Otherwise, we unget up to
|
;; for, which indicates that we are at EOF, so we needn't unget
|
||||||
;; 'hash-length' bytes (32 bytes). However, we must be careful
|
;; anything. Otherwise, we unget up to (* 4 HASH-LENGTH) bytes.
|
||||||
;; not to unget bytes that have already been written, because
|
;; However, we must be careful not to unget bytes that have already
|
||||||
;; that would cause them to be written again from the next
|
;; been written, because that would cause them to be written again
|
||||||
;; buffer. In practice, this case occurs when a replacement is
|
;; from the next buffer. In practice, this case occurs when a
|
||||||
;; made near or beyond the end of the buffer. When REPLACEMENT
|
;; replacement is made near or beyond the end of the buffer. When
|
||||||
;; went beyond END, we consume the extra bytes from INPUT.
|
;; REPLACEMENT went beyond END, we consume the extra bytes from
|
||||||
(begin
|
;; INPUT.
|
||||||
(if (> written end)
|
(if (> w end)
|
||||||
(get-bytevector-n! input buffer 0 (- written end))
|
(get-bytevector-n! input buffer 0 (- w end))
|
||||||
(let* ((unwritten (- end written))
|
(let* ((unwritten (- end w))
|
||||||
(unget-size (if (= end request-size)
|
(unget-size (if (= end request-size)
|
||||||
(min hash-length unwritten)
|
(min (* 4 hash-length)
|
||||||
|
unwritten)
|
||||||
0))
|
0))
|
||||||
(write-size (- unwritten unget-size)))
|
(write-size (- unwritten unget-size)))
|
||||||
(put-bytevector output buffer written write-size)
|
(put-bytevector output buffer w write-size)
|
||||||
(unget-bytevector input buffer (+ written write-size)
|
(unget-bytevector input buffer (+ w write-size)
|
||||||
unget-size)))
|
unget-size)))
|
||||||
(loop)))))))))
|
(outer-loop))
|
||||||
|
|
||||||
|
(scan-from hash-length 0))))))
|
||||||
|
|
||||||
(define (rename-matching-files directory mapping)
|
(define (rename-matching-files directory mapping)
|
||||||
"Apply MAPPING to the names of all the files in DIRECTORY, where MAPPING is
|
"Apply MAPPING to the names of all the files in DIRECTORY, where MAPPING is
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
;;; GNU Guix --- Functional package management for GNU
|
;;; GNU Guix --- Functional package management for GNU
|
||||||
;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
|
;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
|
||||||
|
;;; Copyright © 2021 Mark H Weaver <mhw@netris.org>
|
||||||
;;;
|
;;;
|
||||||
;;; This file is part of GNU Guix.
|
;;; This file is part of GNU Guix.
|
||||||
;;;
|
;;;
|
||||||
|
@ -468,4 +469,86 @@
|
||||||
replacement
|
replacement
|
||||||
"/gnu/store")))))
|
"/gnu/store")))))
|
||||||
|
|
||||||
|
(define (insert-nuls char-size str)
|
||||||
|
(string-join (map string (string->list str))
|
||||||
|
(make-string (- char-size 1) #\nul)))
|
||||||
|
|
||||||
|
(define (nuls-to-underscores s)
|
||||||
|
(string-replace-substring s "\0" "_"))
|
||||||
|
|
||||||
|
(define (annotate-buffer-boundary s)
|
||||||
|
(string-append (string-take s buffer-size)
|
||||||
|
"|"
|
||||||
|
(string-drop s buffer-size)))
|
||||||
|
|
||||||
|
(define (abbreviate-leading-fill s)
|
||||||
|
(let ((s* (string-trim s #\=)))
|
||||||
|
(format #f "[~a =s]~a"
|
||||||
|
(- (string-length s)
|
||||||
|
(string-length s*))
|
||||||
|
s*)))
|
||||||
|
|
||||||
|
(define (prettify-for-display s)
|
||||||
|
(abbreviate-leading-fill
|
||||||
|
(annotate-buffer-boundary
|
||||||
|
(nuls-to-underscores s))))
|
||||||
|
|
||||||
|
(define (two-sample-refs-with-gap char-size1 char-size2 gap offset
|
||||||
|
char1 name1 char2 name2)
|
||||||
|
(string-append
|
||||||
|
(make-string (- buffer-size offset) #\=)
|
||||||
|
(insert-nuls char-size1
|
||||||
|
(string-append "/gnu/store/" (make-string 32 char1) name1))
|
||||||
|
gap
|
||||||
|
(insert-nuls char-size2
|
||||||
|
(string-append "/gnu/store/" (make-string 32 char2) name2))
|
||||||
|
(list->string (map integer->char (iota 77 33)))))
|
||||||
|
|
||||||
|
(define (sample-map-entry old-char new-char new-name)
|
||||||
|
(cons (make-string 32 old-char)
|
||||||
|
(string->utf8 (string-append (make-string 32 new-char)
|
||||||
|
new-name))))
|
||||||
|
|
||||||
|
(define (test-two-refs-with-gap char-size1 char-size2 gap offset)
|
||||||
|
(test-equal
|
||||||
|
(format #f "test-two-refs-with-gap, char-sizes ~a ~a, gap ~s, offset ~a"
|
||||||
|
char-size1 char-size2 gap offset)
|
||||||
|
(prettify-for-display
|
||||||
|
(two-sample-refs-with-gap char-size1 char-size2 gap offset
|
||||||
|
#\6 "-BlahBlaH"
|
||||||
|
#\8"-SoMeTHiNG"))
|
||||||
|
(prettify-for-display
|
||||||
|
(let* ((content (two-sample-refs-with-gap char-size1 char-size2 gap offset
|
||||||
|
#\5 "-blahblah"
|
||||||
|
#\7 "-something"))
|
||||||
|
(replacement (alist->vhash
|
||||||
|
(list (sample-map-entry #\5 #\6 "-BlahBlaH")
|
||||||
|
(sample-map-entry #\7 #\8 "-SoMeTHiNG")))))
|
||||||
|
(call-with-output-string
|
||||||
|
(lambda (output)
|
||||||
|
((@@ (guix build graft) replace-store-references)
|
||||||
|
(open-input-string content) output
|
||||||
|
replacement
|
||||||
|
"/gnu/store")))))))
|
||||||
|
|
||||||
|
(for-each (lambda (char-size1)
|
||||||
|
(for-each (lambda (char-size2)
|
||||||
|
(for-each (lambda (gap)
|
||||||
|
(for-each (lambda (offset)
|
||||||
|
(test-two-refs-with-gap char-size1
|
||||||
|
char-size2
|
||||||
|
gap
|
||||||
|
offset))
|
||||||
|
;; offsets to test
|
||||||
|
(map (lambda (i)
|
||||||
|
(+ i (* 40 char-size1)))
|
||||||
|
(iota 30))))
|
||||||
|
;; gaps
|
||||||
|
'("" "-" " " "a")))
|
||||||
|
;; char-size2 values to test
|
||||||
|
'(1 2)))
|
||||||
|
;; char-size1 values to test
|
||||||
|
'(1 2 4))
|
||||||
|
|
||||||
|
|
||||||
(test-end)
|
(test-end)
|
||||||
|
|
Reference in New Issue