libzypp  17.31.31
mediablocklist.cc
Go to the documentation of this file.
1 /*---------------------------------------------------------------------\
2 | ____ _ __ __ ___ |
3 | |__ / \ / / . \ . \ |
4 | / / \ V /| _/ _/ |
5 | / /__ | | | | | | |
6 | /_____||_| |_| |_| |
7 | |
8 \---------------------------------------------------------------------*/
101 #include "mediablocklist.h"
102 
103 #include <sys/types.h>
104 #include <stdio.h>
105 #include <stdlib.h>
106 #include <string.h>
107 
108 #include <vector>
109 #include <iostream>
110 #include <fstream>
111 
112 #include <zypp-core/base/Logger.h>
113 #include <zypp-core/base/String.h>
114 #include <zypp-core/AutoDispose.h>
115 #include <zypp-core/base/Exception.h>
116 
117 using namespace zypp::base;
118 
119 namespace zypp {
120  namespace media {
121 
122  namespace {
123 
124  struct rsum {
125  unsigned short a = 0;
126  unsigned short b = 0;
127  } __attribute__((packed));
128 
129  /* rcksum_calc_rsum_block(data, data_len)
130  * Calculate the rsum for a single block of data. */
131  rsum rcksum_calc_rsum_block(const unsigned char *data, size_t len) {
132  unsigned short a = 0;
133  unsigned short b = 0;
134 
135  while (len) {
136  unsigned char c = *data++;
137  a += c;
138  b += len * c;
139  len--;
140  }
141  return rsum{ a, b };
142  }
143 
144  // update the rsum by removing the old and adding the new char
145  #define UPDATE_RSUM(a, b, oldc, newc, bshift) do { (a) += ((unsigned char)(newc)) - ((unsigned char)(oldc)); (b) += (a) - ((oldc) << (bshift)); } while (0)
146 
151  void inline truncateRsum ( unsigned int &rs, const int rsumlen )
152  {
153  switch(rsumlen)
154  {
155  case 3:
156  rs &= 0xffffff;
157  break;
158  case 2:
159  rs &= 0xffff;
160  break;
161  case 1:
162  rs &= 0xff;
163  break;
164  default:
165  break;
166  }
167  }
168 
169  }
170 
171 MediaBlockList::MediaBlockList(off_t size)
172 : filesize(size),
173  haveblocks(false),
174  chksumlen(0),
175  chksumpad(0),
176  rsumlen(0),
177  rsumseq(0),
178  rsumpad(0)
179 { }
180 
181 size_t
182 MediaBlockList::addBlock(off_t off, size_t size)
183 {
184  haveblocks = true;
185  blocks.push_back(MediaBlock( off, size ));
186  return blocks.size() - 1;
187 }
188 
189 void
190 MediaBlockList::setFileChecksum(std::string ctype, int cl, unsigned char *c)
191 {
192  if (!cl)
193  return;
194  fsumtype = ctype;
195  fsum.resize(cl);
196  memcpy(&fsum[0], c, cl);
197 }
198 
200 {
201  return fsumtype;
202 }
203 
205 {
206  return fsum;
207 }
208 
209 bool
211 {
212  return digest.create(fsumtype);
213 }
214 
215 bool
217 {
218  if (!haveFileChecksum())
219  return true;
220  std::vector<unsigned char>dig = digest.digestVector();
221  if (dig.empty() || dig.size() < fsum.size())
222  return false;
223  return memcmp(&dig[0], &fsum[0], fsum.size()) ? false : true;
224 }
225 
226 void
227 MediaBlockList::setChecksum(size_t blkno, std::string cstype, int csl, unsigned char *cs, size_t cspad)
228 {
229  if (!csl)
230  return;
231  if (!chksumlen)
232  {
233  if (blkno)
234  return;
235  chksumlen = csl;
236  chksumtype = cstype;
237  chksumpad = cspad;
238  }
239  if (csl != chksumlen || cstype != chksumtype || cspad != chksumpad || blkno != chksums.size() / chksumlen)
240  return;
241  chksums.resize(chksums.size() + csl);
242  memcpy(&chksums[csl * blkno], cs, csl);
243 }
244 
245 void
246 MediaBlockList::setRsum(size_t blkno, int rsl, unsigned int rs, size_t rspad)
247 {
248  if (!rsl)
249  return;
250  if (!rsumlen)
251  {
252  if (blkno)
253  return;
254  rsumlen = rsl;
255  rsumpad = rspad;
256  }
257  if (rsl != rsumlen || rspad != rsumpad || blkno != rsums.size())
258  return;
259  rsums.push_back(rs);
260 }
261 
262 void
264 {
265  rsumseq = seq;
266 }
267 
268 bool
270 {
271  return digest.create(chksumtype);
272 }
273 
274 bool
275 MediaBlockList::verifyDigest(size_t blkno, Digest &digest) const
276 {
277  if (!haveChecksum(blkno))
278  return true;
279  size_t size = blocks[blkno].size;
280  if (!size)
281  return true;
282  if (chksumpad > size)
283  {
284  char pad[chksumpad - size];
285  memset(pad, 0, chksumpad - size);
286  digest.update(pad, chksumpad - size);
287  }
288  std::vector<unsigned char>dig = digest.digestVector();
289  if (dig.empty() || dig.size() < size_t(chksumlen))
290  return false;
291  return memcmp(&dig[0], &chksums[chksumlen * blkno], chksumlen) ? false : true;
292 }
293 
294 unsigned int
295 MediaBlockList::updateRsum(unsigned int rs, const char* bytes, size_t len) const
296 {
297  if (!len)
298  return rs;
299 
300  unsigned short s, m;
301  s = (rs >> 16) & 65535;
302  m = rs & 65535;
303  for (; len > 0 ; len--)
304  {
305  unsigned short c = (unsigned char)*bytes++;
306  s += c;
307  m += s;
308  }
309  return (s & 65535) << 16 | (m & 65535);
310 }
311 
312 bool
313 MediaBlockList::verifyRsum(size_t blkno, unsigned int rs) const
314 {
315  if (!haveRsum(blkno))
316  return true;
317  size_t size = blocks[blkno].size;
318  if (!size)
319  return true;
320  if (rsumpad > size)
321  {
322  unsigned short s, m;
323  s = (rs >> 16) & 65535;
324  m = rs & 65535;
325  m += s * (rsumpad - size);
326  rs = (s & 65535) << 16 | (m & 65535);
327  }
328  truncateRsum( rs, rsumlen );
329  return rs == rsums[blkno];
330 }
331 
332 bool
333 MediaBlockList::checkRsum(size_t blkno, const unsigned char *buf, size_t bufl) const
334 {
335  if (blkno >= blocks.size() || bufl < blocks[blkno].size)
336  return false;
337  unsigned int rs = updateRsum(0, (const char *)buf, blocks[blkno].size);
338  return verifyRsum(blkno, rs);
339 }
340 
341 bool
342 MediaBlockList::checkChecksum(size_t blkno, const unsigned char *buf, size_t bufl) const
343 {
344  if (blkno >= blocks.size() || bufl < blocks[blkno].size)
345  return false;
346  Digest dig;
347  if (!createDigest(dig))
348  return false;
349  dig.update((const char *)buf, blocks[blkno].size);
350  return verifyDigest(blkno, dig);
351 }
352 
354 {
355  if ( !haveChecksum(blkno) )
356  return {};
357 
358  UByteArray buf ( chksumlen, '\0' );
359  memcpy( buf.data(), chksums.data()+(chksumlen * blkno), chksumlen );
360  return buf;
361 }
362 
364 {
365  return chksumtype;
366 }
367 
369 {
370  return chksumpad;
371 }
372 
373 // specialized version of checkChecksum that can deal with a "rotated" buffer
374 bool
375 MediaBlockList::checkChecksumRotated(size_t blkno, const unsigned char *buf, size_t bufl, size_t start) const
376 {
377  if (blkno >= blocks.size() || bufl < blocks[blkno].size)
378  return false;
379  if (start == bufl)
380  start = 0;
381  Digest dig;
382  if (!createDigest(dig))
383  return false;
384  size_t size = blocks[blkno].size;
385  size_t len = bufl - start > size ? size : bufl - start;
386  dig.update((const char *)buf + start, len);
387  if (size > len)
388  dig.update((const char *)buf, size - len);
389  return verifyDigest(blkno, dig);
390 }
391 
392 // write block to the file. can also deal with "rotated" buffers
393 void
394 MediaBlockList::writeBlock(size_t blkno, FILE *fp, const unsigned char *buf, size_t bufl, size_t start, std::vector<bool> &found) const
395 {
396  if (blkno >= blocks.size() || bufl < blocks[blkno].size)
397  return;
398  off_t off = blocks[blkno].off;
399  size_t size = blocks[blkno].size;
400  if (fseeko(fp, off, SEEK_SET))
401  return;
402  if (start == bufl)
403  start = 0;
404  size_t len = bufl - start > size ? size : bufl - start;
405  if (fwrite(buf + start, len, 1, fp) != 1)
406  return;
407  if (size > len && fwrite(buf, size - len, 1, fp) != 1)
408  return;
409  found[blkno] = true;
410  found[blocks.size()] = true;
411 }
412 
413 static size_t
414 fetchnext(FILE *fp, unsigned char *bp, size_t blksize, size_t pushback, unsigned char *pushbackp)
415 {
416  size_t l = blksize;
417  int c;
418 
419  if (pushback)
420  {
421  if (pushbackp != bp)
422  memmove(bp, pushbackp, pushback);
423  bp += pushback;
424  l -= pushback;
425  }
426  while (l)
427  {
428  c = getc(fp);
429  if (c == EOF)
430  break;
431  *bp++ = c;
432  l--;
433  }
434  if (l)
435  memset(bp, 0, l);
436  return blksize - l;
437 }
438 
439 void MediaBlockList::reuseBlocks(FILE *wfp, std::string filename)
440 {
441 
442  zypp::AutoFILE fp;
443 
444  if ( !chksumlen ) {
445  DBG << "Delta XFER: Can not reuse blocks because we have no chksumlen" << std::endl;
446  return;
447  }
448 
449  if ( (fp = fopen(filename.c_str(), "r")) == 0 ) {
450  DBG << "Delta XFER: Can not reuse blocks, unable to open file "<< filename << std::endl;
451  return;
452  }
453 
454  size_t nblks = blocks.size();
455  std::vector<bool> found( nblks + 1 );
456  if (rsumlen && !rsums.empty()) {
457 
458  const auto rsumAMask = rsumlen < 3 ? 0 : rsumlen == 3 ? 0xff : 0xffff;
459 
460  // we are building a array of rsum structs to directly access a and b parts of the checksum
461  // we make the array bigger so that the code calculating the rsum hashes does not need to care
462  // about the array size
463  // @TODO move that to the function adding new rsums when removing the old code
464  auto zsyncRsumsData = std::make_unique<rsum[]>( rsums.size() + rsumseq );
465  auto zsyncRsums = zsyncRsumsData.get();
466  for ( std::size_t i = 0; i < rsums.size(); i++ ) {
467  const auto &rs = rsums[i];
468  unsigned short s, m;
469  s = (rs >> 16) & 65535;
470  m = rs & 65535;
471  zsyncRsums[i] = rsum{ s, m };
472  }
473 
474  // we use the same code as zsync to calc the hash
475  const auto & calc_rhash = [&]( const rsum* e ) -> unsigned {
476  unsigned h = e[0].b;
477  if ( this->rsumseq > 1 ) {
478  for ( uint i = 1; i < rsumseq; i++ ) {
479  h ^= e[i].b << 3;
480  }
481  } else {
482  h ^= ( e[0].a & rsumAMask ) << 3;
483  }
484  return h;
485  };
486 
487  size_t blksize = blocks[0].size;
488  if (nblks == 1 && rsumpad && rsumpad > blksize)
489  blksize = rsumpad;
490 
491  // create hash of checksums
492  // build the hashtable
493  uint rsumHashMask;
494  {
495  int i = 16;
496 
497  /* Try hash size of 2^i; step down the value of i until we find a good size */
498  while ((2 << (i - 1)) > nblks && i > 4)
499  i--;
500 
501  /* Allocate hash based on rsum */
502  rsumHashMask = (2 << i) - 1;
503  }
504 
505  // a array indexed via hash with lists of blocks resulting in the same rsum hash
506  auto rsumHashTableData = std::make_unique<std::vector<size_t>[]>( rsumHashMask + 1 );
507  auto rsumHashTable = rsumHashTableData.get();
508 
509  // Now fill in the hash tables.
510  for ( size_t id = 0; id < nblks; id++) {
511  const auto hash = calc_rhash( &zsyncRsums[id] );
512  auto &hashList = rsumHashTable[ hash & rsumHashMask ];
513  hashList.push_back(id);
514  }
515 
516  // we read in 16 sequences at once to speed up processing
517  constexpr auto BLOCKCNT = 16;
518 
519  // we allocate the buffer so that we always have the data to verify 16 blocks, if we need to do
520  // sequence matching we grow the buffer accordingly
521  const auto readBufSize = blksize * rsumseq * BLOCKCNT;
522 
523  // buffer thats going to hold our cached data
524  auto readBufData = std::make_unique<unsigned char[]>( readBufSize );
525  memset(readBufData.get(), 0, blksize);
526 
527  // avoid using .get() all the time
528  auto readBuf = readBufData.get();
529 
530  // our running checksums for the blocks we need to match in sequence
531  auto seqRsumsData = std::make_unique<rsum[]> ( rsumseq );
532  auto seqRsums = seqRsumsData.get();
533 
534  // use byteshift instead of multiplication if the blksize is a power of 2
535  // a value is a power of 2 if ( N & N-1 ) == 0
536  int bshift = 0; // how many bytes do we need to shift
537  if ((blksize & (blksize - 1)) == 0)
538  for (bshift = 0; size_t(1 << bshift) != blksize; bshift++)
539  ;
540 
541  bool init = true;
542  // when we are in a run of matches, we remember which block ID would need to match next in order
543  // to continue writing
544  std::optional<size_t> nextReqMatchInSequence;
545  off_t dataOffset = 0; //< Our current read offset in the buffer
546  off_t dataLen = 0; //< The length of our read buffer
547 
548  // helper lambda that follows a list of hashmap entries and tries to write those that match
549  const auto &tryWriteMatchingBlocks = [&]( const std::vector<size_t> &list, const u_char *currBuf, uint reqMatches ){
550  // the number of blocks we have transferred to the target file
551  int targetBlocksWritten = 0;
552 
553  // reset the next match hint
554  nextReqMatchInSequence.reset();
555 
556  for ( const auto blkno : list ) {
557 
558  if ( found[blkno] )
559  continue;
560 
561  const auto blockRsum = &zsyncRsums[blkno];
562 
563  uint weakMatches = 0;
564 
565  // first check only the current block, we maybe can skip checking the others
566  // if we are in a run of matches
567  if ( (seqRsums[0].a & rsumAMask) != blockRsum[0].a ||
568  seqRsums[0].b != blockRsum[0].b )
569  continue;
570 
571  weakMatches++;
572 
573  for ( uint i = 1; i < reqMatches; i++ ) {
574  if ( (seqRsums[i].a & rsumAMask) != blockRsum[i].a ||
575  seqRsums[i].b != blockRsum[i].b )
576  break;
577  weakMatches++;
578  }
579 
580  if ( weakMatches < reqMatches )
581  continue;
582 
583  // we have a weak match, now we need to calc the checksums for the blocks
584  uint realMatches = 0;
585  for( uint i = 0; i < reqMatches; i++ ) {
586  if ( !checkChecksum(blkno + i, currBuf + ( i * blksize ), blksize ) ) {
587  break;
588  }
589  realMatches++;
590  }
591 
592  // check if we have the amount of matches we need ( only 1 if we are in a block sequence )
593  if( realMatches < reqMatches )
594  continue;
595 
596  // we found blocks that match , write them to target but keep searching the hashmap
597  // in case we have redundancies
598  const auto nextPossibleMatch = blkno + realMatches;
599  if ( !found[nextPossibleMatch] )
600  nextReqMatchInSequence = nextPossibleMatch; // remember that we are currently in a run of matches, next iteration we just need to look at one block
601 
602  for( uint i = 0; i < realMatches; i++ ) {
603  writeBlock( blkno + i, wfp, currBuf + ( i * blksize ), blksize, 0, found );
604  targetBlocksWritten++;
605  }
606  }
607  return targetBlocksWritten;
608  };
609 
610  if (!rsumseq)
611  rsumseq = nblks > 1 && chksumlen < 16 ? 2 : 1;
612 
613  const off_t seqMatchLen = ( blksize * rsumseq ); //< how many bytes do we need to match when searching a block
614 
615  while (! feof(fp) ) {
616  if ( init ) {
617  // fill the buffer for the first time
618  dataLen = fread( readBuf, 1, readBufSize, fp );
619  init = false;
620  } else {
621  // move the remaining data to the begin and read from the file to fill up the buffer again
622  const auto remainLen = dataLen-dataOffset;
623  if ( remainLen )
624  memmove( readBuf, readBuf+dataOffset, remainLen );
625 
626  dataLen = fread( readBuf+remainLen, 1, readBufSize-remainLen, fp );
627  dataLen += remainLen;
628  dataOffset = 0;
629  }
630 
631  // if we hit eof, pad with zeros
632  if ( feof(fp) ) {
633  memset( readBuf + dataLen, 0, readBufSize - dataLen );
634  dataLen = readBufSize;
635  }
636 
637  if ( dataLen < seqMatchLen )
638  return;
639 
640  // intialize our first set of checksums
641  for( uint i = 0; i < rsumseq; i++ )
642  seqRsums[i] = rcksum_calc_rsum_block( readBuf + ( i * blksize ), blksize );
643 
644  //read over the buffer we have allocated so far
645  while ( true ) {
646 
647  if ( dataOffset + seqMatchLen > dataLen )
648  break;
649 
650  u_char *currBuf = readBuf + dataOffset;
651 
652  // the number of deltafile blocks we have matched, e.g. how much blocks
653  // can we skip forward
654  uint deltaBlocksMatched = 0;
655 
656  if ( nextReqMatchInSequence.has_value() ) {
657  if ( tryWriteMatchingBlocks( { *nextReqMatchInSequence }, currBuf, 1 ) > 0 )
658  deltaBlocksMatched = 1;
659 
660  } else {
661  const auto hash = calc_rhash( seqRsums );
662 
663  // reference to the list of blocks that share our calculated hash
664  auto &blockListForHash = rsumHashTable[ hash & rsumHashMask ];
665  if ( blockListForHash.size() ) {
666  if ( tryWriteMatchingBlocks( blockListForHash, currBuf, rsumseq ) > 0 )
667  deltaBlocksMatched = rsumseq;
668  }
669  }
670 
671  if ( deltaBlocksMatched > 0 ) {
672  // we jump forward in the buffer to after what we matched
673  dataOffset += ( deltaBlocksMatched * blksize );
674 
675  if ( dataOffset + seqMatchLen > dataLen )
676  break;
677 
678  if ( deltaBlocksMatched < rsumseq ) {
679  //@TODO move the rsums we already have
680  }
681 
682  for( uint i = 0; i < rsumseq; i++ )
683  seqRsums[i] = rcksum_calc_rsum_block( readBuf + dataOffset + ( i * blksize ), blksize );
684 
685 
686  } else {
687  // we found nothing advance the window by one byte and update the rsums
688  dataOffset++;
689  if ( dataOffset + seqMatchLen > dataLen )
690  break;
691  for ( uint i = 0; i < rsumseq; i++ ) {
692  const auto blkOff = ( i*blksize );
693  u_char oldC = (currBuf + blkOff)[0];
694  u_char newC = (currBuf + blkOff)[blksize];
695  UPDATE_RSUM( seqRsums[i].a, seqRsums[i].b, oldC, newC, bshift );
696  }
697  }
698  }
699  }
700  }
701  else if (chksumlen >= 16)
702  {
703  // dummy variant, just check the checksums
704  size_t bufl = 4096;
705  off_t off = 0;
706  auto buf = std::make_unique<unsigned char[]>( bufl );
707  for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
708  {
709  if (off > blocks[blkno].off)
710  continue;
711  size_t blksize = blocks[blkno].size;
712  if (blksize > bufl)
713  {
714  bufl = blksize;
715  buf = std::make_unique<unsigned char[]>( bufl );
716  }
717  size_t skip = blocks[blkno].off - off;
718  while (skip)
719  {
720  size_t l = skip > bufl ? bufl : skip;
721  if (fread(buf.get(), l, 1, fp) != 1)
722  break;
723  skip -= l;
724  off += l;
725  }
726  if (fread(buf.get(), blksize, 1, fp) != 1)
727  break;
728  if (checkChecksum(blkno, buf.get(), blksize))
729  writeBlock(blkno, wfp, buf.get(), blksize, 0, found);
730  off += blksize;
731  }
732  }
733  if (!found[nblks]) {
734  DBG << "Delta XFER: No reusable blocks found for " << filename << std::endl;
735  return;
736  }
737  // now throw out all of the blocks we found
738  std::vector<MediaBlock> nblocks;
739  std::vector<unsigned char> nchksums;
740  std::vector<unsigned int> nrsums;
741 
742  size_t originalSize = 0;
743  size_t newSize = 0;
744  for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
745  {
746  const auto &blk = blocks[blkno];
747  originalSize += blk.size;
748  if (!found[blkno])
749  {
750  // still need it
751  nblocks.push_back(blk);
752  newSize += blk.size;
753  if (chksumlen && (blkno + 1) * chksumlen <= chksums.size())
754  {
755  nchksums.resize(nblocks.size() * chksumlen);
756  memcpy(&nchksums[(nblocks.size() - 1) * chksumlen], &chksums[blkno * chksumlen], chksumlen);
757  }
758  if (rsumlen && (blkno + 1) <= rsums.size())
759  nrsums.push_back(rsums[blkno]);
760  }
761  }
762  DBG << "Delta XFER: Found blocks to reuse, " << blocks.size() << " vs " << nblocks.size() << ", resused blocks: " << blocks.size() - nblocks.size() << "\n"
763  << "Old transfer size: " << originalSize << " new size: " << newSize << std::endl;
764  blocks = nblocks;
765  chksums = nchksums;
766  rsums = nrsums;
767 }
768 
769 void MediaBlockList::reuseBlocksOld(FILE *wfp, std::string filename)
770 {
771 
772  zypp::AutoFILE fp;
773 
774  if ( !chksumlen ) {
775  DBG << "Delta XFER: Can not reuse blocks because we have no chksumlen" << std::endl;
776  return;
777  }
778 
779  if ( (fp = fopen(filename.c_str(), "r")) == 0 ) {
780  DBG << "Delta XFER: Can not reuse blocks, unable to open file "<< filename << std::endl;
781  return;
782  }
783  size_t nblks = blocks.size();
784  std::vector<bool> found;
785  found.resize(nblks + 1);
786  if (rsumlen && !rsums.empty())
787  {
788  size_t blksize = blocks[0].size;
789  if (nblks == 1 && rsumpad && rsumpad > blksize)
790  blksize = rsumpad;
791 
792  // create hash of checksums
793 
794  // calculate the size of the hashtable by setting
795  // all bits to 1 up the to currently set MSB
796  // if we have 00010010 we end up with 00011111
797  unsigned int hm = rsums.size() * 2;
798  while (hm & (hm - 1)) {
799  hm &= hm - 1;
800  }
801  hm = hm * 2 - 1;
802 
803  // we want at least a size if 0011 1111 1111 1111
804  if (hm < 16383)
805  hm = 16383;
806 
807  // simple hashtable of checksums
808  auto rsumHashTable = std::make_unique<unsigned int[]>( hm+1 );
809  memset(rsumHashTable.get(), 0, (hm + 1) * sizeof(unsigned int));
810 
811  // insert each rsum into the hash table
812  for (unsigned int i = 0; i < rsums.size(); i++)
813  {
814  if (blocks[i].size != blksize && (i != nblks - 1 || rsumpad != blksize))
815  continue;
816  unsigned int r = rsums[i];
817  unsigned int h = r & hm;
818  unsigned int hh = 7;
819  while (rsumHashTable[h])
820  h = (h + hh++) & hm;
821  rsumHashTable[h] = i + 1;
822  }
823 
824  // read in block by block to find matches
825  // the read buffer "buf" works like a ring buffer, means that once we are done with reading a full block
826  // and didn't find a match we start again at buf[0] , filling the buffer up again, rotating until we find
827  // a matching block. Once we find a matching block all we need to do is check if the current offset "i"
828  // is at the end of the buffer, then we can simply write the full buffer out, or if its somewhere in between
829  // then the begin of our block is buf[i+1, bufsize-1] and the end buf[0,i]
830  auto ringBuf = std::make_unique<unsigned char[]>( blksize );
831 
832  // we use a second buffer to read in the next block if we are required to match more than one block at the same time.
833  auto buf2 = std::make_unique<unsigned char[]>( blksize );
834 
835  // when we are required to match more than one block, it is read into buf2 advancing the file pointer,
836  // to make sure that we do not loose those bytes in case the match fails we remember their count and
837  // start in buf2, in the next loop those will be consumed before reading from the file again
838  size_t pushback = 0;
839  unsigned char *pushbackp = 0;
840 
841  // use byteshift instead of multiplication if the blksize is a power of 2
842  // a value is a power of 2 if ( N & N-1 ) == 0
843  int bshift = 0; // how many bytes do we need to shift
844  if ((blksize & (blksize - 1)) == 0)
845  for (bshift = 0; size_t(1 << bshift) != blksize; bshift++)
846  ;
847 
848  // a and b are the LS and MS bytes of the checksum, calculated a rolling style Adler32 checksum
849  //
850  // a(k,l) = (\sum_{i=k}^l X_i) \bmod M
851  unsigned short a, b;
852  a = b = 0;
853  memset(ringBuf.get(), 0, blksize);
854  bool eof = 0;
855  bool init = 1;
856 
857  if (!rsumseq)
858  rsumseq = nblks > 1 && chksumlen < 16 ? 2 : 1;
859 
860  while (!eof)
861  {
862  for (size_t i = 0; i < blksize; i++)
863  {
864  // get the next character from the file
865  // or if there are pushback chars use those
866  int c;
867  if (eof)
868  c = 0;
869  else
870  {
871  if (pushback)
872  {
873  c = *pushbackp++;
874  pushback--;
875  }
876  else
877  c = getc(fp);
878  if (c == EOF)
879  {
880  eof = true;
881  c = 0;
882  if (!i || rsumseq == 2)
883  break;
884  }
885  }
886 
887  // calculate the rsum on the fly using recurrence relations, see https://rsync.samba.org/tech_report/node3.html
888  // basically we subtract the checksum value of a byte the leaves the current block window and add the new incoming one
889  // using this trick we do not need to calculate the full block checksum
890  // the least significant part of the checksum ( lower 8 bits ) is simply the sum of all chars in the block , modulo 2^16
891  // zsync uses only a 16bit type to calculate the sums and as far as i can see does not do the modulo per block as the formula
892  // says it should, we might need to do the same
893  int oc = ringBuf[i];
894  ringBuf[i] = c;
895 
896  a += c - oc;
897 
898  // this is calculates the most significant part of the checksum, bshift should be always set since blocksize
899  // should always be a power of 2
900  if (bshift)
901  b += a - ( oc << bshift );
902  else
903  // This seems to make no sense it does not even factor in the character itself
904  b += 2 * blksize;
905 
906  if (init)
907  {
908  // continue reading bytes until we have the full block in our buffer
909  if (size_t(i) != blksize - 1)
910  continue;
911  init = 0;
912  }
913 
914  unsigned int r = ((unsigned int)a & 65535) << 16 | ((unsigned int)b & 65535);
915  truncateRsum(r, rsumlen);
916 
917  unsigned int h = r & hm;
918  unsigned int hh = 7;
919 
920  // go through our hashmap to find all the matching rsums
921  for (; rsumHashTable[h]; h = (h + hh++) & hm)
922  {
923  size_t blkno = rsumHashTable[h] - 1;
924 
925  // does the current block match?
926  if (rsums[blkno] != r)
927  continue;
928  if (found[blkno])
929  continue;
930 
931  // if we need to always match 2 blocks in sequence, get the next block
932  // and check its checksum
933  if (rsumseq == 2)
934  {
935  if (eof || blkno + 1 >= nblks)
936  continue;
937  pushback = fetchnext(fp, buf2.get(), blksize, pushback, pushbackp);
938  pushbackp = buf2.get();
939  if (!pushback)
940  continue;
941 
942  if (!checkRsum(blkno + 1, buf2.get(), blksize))
943  continue;
944  }
945 
946  // here we have matched all blocks that we need, do the heavy checksum
947  if (!checkChecksumRotated(blkno, ringBuf.get(), blksize, i + 1))
948  continue;
949 
950  // heavy checksum for second block
951  if (rsumseq == 2 && !checkChecksum(blkno + 1, buf2.get(), blksize))
952  continue;
953 
954  // write the first and second blocks if applicable
955  writeBlock(blkno, wfp, ringBuf.get(), blksize, i + 1, found);
956  if (rsumseq == 2)
957  {
958  writeBlock(blkno + 1, wfp, buf2.get(), blksize, 0, found);
959  pushback = 0;
960  blkno++;
961  }
962 
963  // try to continue as long as we still match blocks
964  while (!eof)
965  {
966  blkno++;
967  pushback = fetchnext(fp, buf2.get(), blksize, pushback, pushbackp);
968  pushbackp = buf2.get();
969  if (!pushback)
970  break;
971 
972  if (!checkRsum(blkno, buf2.get(), blksize))
973  break;
974 
975  if (!checkChecksum(blkno, buf2.get(), blksize))
976  break;
977 
978  writeBlock(blkno, wfp, buf2.get(), blksize, 0, found);
979  pushback = 0;
980  }
981 
982  // if we get to this part we found at least a block, skip over the current block and start reading
983  // in a full block
984  init = true;
985  memset(ringBuf.get(), 0, blksize);
986  a = b = 0;
987  i = size_t(-1); // start with 0 on next iteration
988  break;
989  }
990  }
991  }
992  }
993  else if (chksumlen >= 16)
994  {
995  // dummy variant, just check the checksums
996  size_t bufl = 4096;
997  off_t off = 0;
998  auto buf = std::make_unique<unsigned char[]>( bufl );
999  for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
1000  {
1001  if (off > blocks[blkno].off)
1002  continue;
1003  size_t blksize = blocks[blkno].size;
1004  if (blksize > bufl)
1005  {
1006  bufl = blksize;
1007  buf = std::make_unique<unsigned char[]>( bufl );
1008  }
1009  size_t skip = blocks[blkno].off - off;
1010  while (skip)
1011  {
1012  size_t l = skip > bufl ? bufl : skip;
1013  if (fread(buf.get(), l, 1, fp) != 1)
1014  break;
1015  skip -= l;
1016  off += l;
1017  }
1018  if (fread(buf.get(), blksize, 1, fp) != 1)
1019  break;
1020  if (checkChecksum(blkno, buf.get(), blksize))
1021  writeBlock(blkno, wfp, buf.get(), blksize, 0, found);
1022  off += blksize;
1023  }
1024  }
1025  if (!found[nblks]) {
1026  DBG << "Delta XFER: No reusable blocks found for " << filename << std::endl;
1027  return;
1028  }
1029  // now throw out all of the blocks we found
1030  std::vector<MediaBlock> nblocks;
1031  std::vector<unsigned char> nchksums;
1032  std::vector<unsigned int> nrsums;
1033 
1034  size_t originalSize = 0;
1035  size_t newSize = 0;
1036  for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
1037  {
1038  const auto &blk = blocks[blkno];
1039  originalSize += blk.size;
1040  if (!found[blkno])
1041  {
1042  // still need it
1043  nblocks.push_back(blk);
1044  newSize += blk.size;
1045  if (chksumlen && (blkno + 1) * chksumlen <= chksums.size())
1046  {
1047  nchksums.resize(nblocks.size() * chksumlen);
1048  memcpy(&nchksums[(nblocks.size() - 1) * chksumlen], &chksums[blkno * chksumlen], chksumlen);
1049  }
1050  if (rsumlen && (blkno + 1) <= rsums.size())
1051  nrsums.push_back(rsums[blkno]);
1052  }
1053  }
1054  DBG << "Delta XFER: Found blocks to reuse, " << blocks.size() << " vs " << nblocks.size() << ", resused blocks: " << blocks.size() - nblocks.size() << "\n"
1055  << "Old transfer size: " << originalSize << " new size: " << newSize << std::endl;
1056  blocks = nblocks;
1057  chksums = nchksums;
1058  rsums = nrsums;
1059 }
1060 
1061 std::string
1063 {
1064  std::string s;
1065  size_t i, j;
1066 
1067  if (filesize != off_t(-1))
1068  {
1069  long long size = filesize;
1070  s = zypp::str::form("[ BlockList, file size %lld\n", size);
1071  }
1072  else
1073  s = "[ BlockList, filesize unknown\n";
1074  if (!haveblocks)
1075  s += " No block information\n";
1076  if (chksumpad)
1077  s += zypp::str::form(" Checksum pad %zd\n", chksumpad);
1078  if (rsumpad)
1079  s += zypp::str::form(" Rsum pad %zd\n", rsumpad);
1080  for (i = 0; i < blocks.size(); ++i)
1081  {
1082  long long off=blocks[i].off;
1083  long long size=blocks[i].size;
1084  s += zypp::str::form(" (%8lld, %8lld)", off, size);
1085  if (chksumlen && chksums.size() >= (i + 1) * chksumlen)
1086  {
1087  s += " " + chksumtype + ":";
1088  for (j = 0; j < size_t(chksumlen); j++)
1089  s += zypp::str::form("%02hhx", chksums[i * chksumlen + j]);
1090  }
1091  if (rsumlen && rsums.size() > i)
1092  {
1093  s += " RSUM:";
1094  s += zypp::str::form("%0*x", 2 * rsumlen, rsums[i]);
1095  }
1096  s += "\n";
1097  }
1098  s += "]";
1099  return s;
1100 }
1101 
1102  } // namespace media
1103 } // namespace zypp
void reuseBlocksOld(FILE *wfp, std::string filename)
scan a file for blocks from our blocklist.
size_t addBlock(off_t off, size_t size)
add a block with offset off and size size to the block list.
void writeBlock(size_t blkno, FILE *fp, const unsigned char *buf, size_t bufl, size_t start, std::vector< bool > &found) const
UByteArray digestVector()
get vector of unsigned char representation of the digest
Definition: Digest.cc:291
std::vector< MediaBlock > blocks
unsigned short b
Compute Message Digests (MD5, SHA1 etc)
Definition: Digest.h:37
void reuseBlocks(FILE *wfp, std::string filename)
void setRsumSequence(uint seq)
how many blocks in sequence need to have the correct checksums to be considered a match ...
bool createDigest(Digest &digest) const
std::vector< unsigned char > chksums
std::vector< unsigned int > rsums
std::string form(const char *format,...) __attribute__((format(printf
Printf style construction of std::string.
Definition: String.cc:36
const UByteArray & getFileChecksum()
bool haveRsum(size_t blkno) const
bool verifyFileDigest(Digest &digest) const
std::string getChecksumType() const
static size_t fetchnext(FILE *fp, unsigned char *bp, size_t blksize, size_t pushback, unsigned char *pushbackp)
void setRsum(size_t blkno, int rsl, unsigned int rs, size_t rspad=0)
set / verify the (weak) rolling checksum over a single block
bool verifyDigest(size_t blkno, Digest &digest) const
bool checkChecksumRotated(size_t blkno, const unsigned char *buf, size_t bufl, size_t start) const
a single block from the blocklist, consisting of an offset and a size
bool createFileDigest(Digest &digest) const
bool create(const std::string &name)
initialize creation of a new message digest
Definition: Digest.cc:195
unsigned int updateRsum(unsigned int rs, const char *bytes, size_t len) const
struct zypp::media::MediaBlock __attribute__
UByteArray getChecksum(size_t blkno) const
#define UPDATE_RSUM(a, b, oldc, newc, bshift)
bool haveChecksum(size_t blkno) const
unsigned short a
std::string asString() const
return block list as string
AutoDispose<FILE*> calling ::fclose
Definition: AutoDispose.h:312
bool verifyRsum(size_t blkno, unsigned int rs) const
bool checkChecksum(size_t blkno, const unsigned char *buf, size_t bufl) const
void setFileChecksum(std::string ctype, int cl, unsigned char *c)
set / verify the checksum over the whole file
Easy-to use interface to the ZYPP dependency resolver.
Definition: CodePitfalls.doc:1
void setChecksum(size_t blkno, std::string cstype, int csl, unsigned char *cs, size_t cspad=0)
set / verify the (strong) checksum over a single block
bool update(const char *bytes, size_t len)
feed data into digest computation algorithm
Definition: Digest.cc:309
bool checkRsum(size_t blkno, const unsigned char *buf, size_t bufl) const
#define DBG
Definition: Logger.h:95
std::string fileChecksumType() const