From db034660fbfefdfef1d9f7b71d25207be774726d Mon Sep 17 00:00:00 2001 From: Thomas Wang <24695242+thomasw21@users.noreply.github.com> Date: Wed, 4 May 2022 08:40:24 +0200 Subject: [PATCH] Fix hashing for deduplication (#17048) --- examples/research_projects/codeparrot/scripts/preprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py index bb037750a..4e09379a9 100644 --- a/examples/research_projects/codeparrot/scripts/preprocessing.py +++ b/examples/research_projects/codeparrot/scripts/preprocessing.py @@ -1,4 +1,5 @@ import gzip +import hashlib import multiprocessing import os import shutil @@ -13,7 +14,7 @@ from transformers import HfArgumentParser def get_hash(example): """Get hash of content field.""" - return {"hash": hash(example["content"])} + return {"hash": hashlib.md5(example["content"].strip().encode("utf-8")).hexdigest()} def line_stats(example):