mirror of
https://github.com/saymrwulf/transformers.git
synced 2026-05-14 20:58:08 +00:00
Fix hashing for deduplication (#17048)
This commit is contained in:
parent
39f8eafc1b
commit
db034660fb
1 changed file with 2 additions and 1 deletion
|
|
@@ -1,4 +1,5 @@
|
|||
import gzip
|
||||
import hashlib
|
||||
import multiprocessing
|
||||
import os
|
||||
import shutil
|
||||
|
|
@@ -13,7 +14,7 @@ from transformers import HfArgumentParser
|
|||
|
||||
def get_hash(example):
|
||||
"""Get hash of content field."""
|
||||
return {"hash": hash(example["content"])}
|
||||
return {"hash": hashlib.md5(example["content"].strip().encode("utf-8")).hexdigest()}
|
||||
|
||||
|
||||
def line_stats(example):
|
||||
|
|
|
|||
Loading…
Reference in a new issue