dedup.cc 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. // Copyright (c) 2018-present, Facebook, Inc.
  2. // All rights reserved.
  3. //
  4. // This source code is licensed under the MIT license found in the
  5. // LICENSE file in the root directory of this source tree.
  6. #include <cstdint>
  7. #include <iostream>
  8. #include <fstream>
  9. #include <string>
  10. #include <vector>
  /**
   * 64-bit FNV-1a hash of a byte buffer.
   *
   * For every input byte the running hash is XOR-ed with the byte and then
   * multiplied by the 64-bit FNV prime (arithmetic wraps modulo 2^64).
   *
   * @param data  Pointer to the first byte to hash; never dereferenced when
   *              sz is 0. The buffer is only read, hence the const pointee.
   * @param sz    Number of bytes to hash.
   * @param h     Starting hash state. Defaults to the standard 64-bit FNV
   *              offset basis; pass another value to seed a different hash
   *              function, or pass a previous result to continue hashing.
   * @return      The hash state after all sz bytes have been consumed
   *              (equal to h when sz is 0).
   */
  uint64_t fnv1a_64(const uint8_t *data, size_t sz, uint64_t h=14695981039346656037ull)
  {
      constexpr uint64_t kFnvPrime = 1099511628211ull;
      for (size_t i = 0; i < sz; ++i) {
          h ^= uint64_t(data[i]);
          h *= kFnvPrime;
      }
      return h;
  }
  19. int main(int argc, char** argv)
  20. {
  21. uint64_t init_values[] = {
  22. 14695981039346656037ull,
  23. 9425296925403859339ull,
  24. 13716263814064014149ull,
  25. 3525492407291847033ull,
  26. 8607404175481815707ull,
  27. 9818874561736458749ull,
  28. 10026508429719773353ull,
  29. 3560712257386009938ull
  30. };
  31. size_t n = 1ull<<34, num_hashes = 2;
  32. std::vector<bool> seen(n);
  33. std::ios_base::sync_with_stdio(false);
  34. for (std::string line; std::getline(std::cin, line);) {
  35. bool b = true;
  36. for (size_t i = 0; i < num_hashes; i++) {
  37. uint64_t h = fnv1a_64((uint8_t*) line.data(), line.length(), init_values[i]) % n;
  38. b = b && seen[h];
  39. seen[h] = true;
  40. }
  41. if (!b) {
  42. std::cout << line << std::endl;
  43. }
  44. }
  45. return 0;
  46. }