Fix hashing for deduplication (#17048)

This commit is contained in:
Thomas Wang 2022-05-04 08:40:24 +02:00 committed by GitHub
parent 39f8eafc1b
commit db034660fb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -1,4 +1,5 @@
import gzip
import hashlib
import multiprocessing
import os
import shutil
@ -13,7 +14,7 @@ from transformers import HfArgumentParser
def get_hash(example):
    """Return a deterministic hash of the example's ``content`` field.

    Args:
        example: Mapping with a ``"content"`` key holding a ``str``.

    Returns:
        A dict ``{"hash": <hex digest>}`` where the digest is the MD5 of the
        content after stripping leading/trailing whitespace and encoding it
        as UTF-8.
    """
    # The built-in hash() is salted per interpreter process for str (unless
    # PYTHONHASHSEED is fixed), so its values are not comparable across runs
    # or across separate worker processes. A content digest is stable, which
    # is what hash-based deduplication needs.
    normalized = example["content"].strip().encode("utf-8")
    return {"hash": hashlib.md5(normalized).hexdigest()}
def line_stats(example):