每次看到喜欢的图就存，导致很多图片存重复了，一直想把重复的图片删掉。

发布网友发布时间：2022-04-30 07:50

共5个回答

热心网友时间：2023-10-21 10:32

不知道楼主是否还需要，我也有这个需求，但是没找到感觉合适的软件，于是自己写了一段代码，喜欢的话可以拿去用。

# -*- coding:utf-8 -*-
from PIL import Image
from PIL import ImageFile
import sys
from webUrlGetor.settings import *

ImageFile.LOAD_TRUNCATED_IMAGES = True

class SavePic(object):
    """Detect near-duplicate images in a folder and delete the smaller copy.

    Each image is reduced to a string of '0'/'1' characters (one per pixel,
    compared against the mean grey level).  Two images count as duplicates
    when the share of matching characters exceeds the ``like`` threshold.

    The class uses ``Image``, ``sys``, ``os``, ``accuracy`` and ``like`` from
    the enclosing module.  ``os``, ``accuracy`` and ``like`` are not imported
    by name in the visible part of this file; presumably they come from the
    ``webUrlGetor.settings`` star import -- TODO confirm.

    The code is written so that it runs unchanged on Python 2 and Python 3.
    """

    def __init__(self):
        pass

    @staticmethod
    def _echo(*parts):
        """Write ``parts`` to stdout separated by single spaces, plus a newline.

        Produces the same output as the Python 2 statement ``print a, b`` while
        remaining valid syntax on Python 3.
        """
        sys.stdout.write(" ".join(str(part) for part in parts) + "\n")

    def getGray(self, image_file):
        """Return every pixel value of ``image_file`` as a flat row-major list.

        :param image_file: object exposing ``size`` as ``(width, height)`` and
            ``getpixel((x, y))``.
        :return: list of pixel values, rows from top to bottom.
        """
        width, height = image_file.size
        return [image_file.getpixel((w, h))
                for h in range(height)
                for w in range(width)]

    def getAvg(self, ls):
        """Return the arithmetic mean of ``ls`` as a float.

        The explicit ``float`` keeps the result identical on Python 2 and 3;
        a plain ``/`` would floor the mean of integers on Python 2.

        :raises ZeroDivisionError: if ``ls`` is empty.
        """
        return float(sum(ls)) / len(ls)

    def getImgHash(self, fne, size=(480, 480)):
        """Compute the average-hash bit string of the image stored at ``fne``.

        The image is resized to ``size``, converted to 8-bit greyscale, and
        every pixel except the one-pixel outer border is mapped to '1' when it
        is >= the mean grey level of the whole image, otherwise to '0'.

        :param fne: path of the image file.
        :param size: ``(width, height)`` the image is resized to before hashing.
        :return: string of '0'/'1' characters, ``(width - 2) * (height - 2)`` long.
        """
        image_file = Image.open(fne)
        try:
            gray_image = image_file.resize(size).convert("L")
        finally:
            # resize() returned an independent image, so the file can be released.
            image_file.close()

        width, height = gray_image.size
        pixels = self.getGray(gray_image)
        avg = self.getAvg(pixels)
        # Reuse the pixel list instead of calling getpixel() a second time, and
        # join once instead of growing a string character by character.
        return ''.join(
            '1' if value >= avg else '0'
            for h in range(1, height - 1)
            for value in pixels[h * width + 1:(h + 1) * width - 1]
        )

    def getMH(self, a, b):
        """Return the share of positions at which ``a`` and ``b`` are equal.

        :param a: reference hash string; its length is the denominator.
        :param b: hash string to compare; must be at least as long as ``a``.
        :return: the ratio rendered with the module-level ``accuracy`` format
            spec, i.e. a string (callers convert it back with ``float``).
        :raises IndexError: if ``b`` is shorter than ``a``.
        :raises ZeroDivisionError: if ``a`` is empty.
        """
        same = sum(1 for i in range(len(a)) if a[i] == b[i])
        return format(float(same) / float(len(a)), accuracy)

    def compare_pic_hash(self, hash_a, hash_b):
        """Return the similarity of two image hashes (see ``getMH``).

        :param hash_a: hash of image A
        :param hash_b: hash of image B
        :return: formatted similarity ratio
        """
        return self.getMH(hash_a, hash_b)

    def get_file_name_list_from_path(self, file_path):
        """List the ``.jpg``/``.png`` entries of ``file_path`` of at least 100 KiB.

        WARNING: as a side effect every directory entry smaller than 102400
        bytes is deleted, whatever its extension.  Failed deletions are
        reported on stdout and otherwise ignored.

        :param file_path: directory to scan (not recursive).
        :return: list of file names (without directory part).
        """
        file_name_list = []
        for file_name in os.listdir(file_path):
            full_path = os.path.join(file_path, file_name)
            if os.path.getsize(full_path) < 102400:
                try:
                    os.remove(full_path)
                except Exception as e:
                    self._echo("删除失败，原因：", e)
            elif file_name.endswith((".jpg", ".png")):
                file_name_list.append(file_name)
        return file_name_list

    def get_file_hash_list_from_file_name_list(self, file_path, file_name_list):
        """Hash every file in ``file_name_list`` and show a progress bar.

        Hashing stops at the first failure; the error is printed and the
        hashes collected so far are returned, so the result may be shorter
        than ``file_name_list``.

        :param file_path: directory containing the files.
        :param file_name_list: file names relative to ``file_path``.
        :return: list of hash strings, in the same order as the names.
        """
        file_hash_list = []
        file_count = len(file_name_list)
        try:
            for index, file_name in enumerate(file_name_list):
                file_hash_list.append(
                    self.getImgHash(os.path.join(file_path, file_name)))
                self.view_bar(index, file_count)
        except Exception as e:
            self._echo("获取文件hash错误！", e)
        return file_hash_list

    def getDocSize(self, path):
        """Return the size of ``path`` in bytes, or ``None`` if it cannot be read.

        The error is printed to stdout when the size cannot be determined.
        """
        try:
            return os.path.getsize(path)
        except Exception as err:
            print(err)
            return None

    def view_bar(self, num, total):
        """Redraw a 100-column text progress bar on the current stdout line.

        The displayed percentage is ``int(num / total * 100) + 1`` capped at
        100, so the bar never overflows when ``num == total``.  A ``total`` of
        zero is shown as complete instead of raising ``ZeroDivisionError``.
        """
        rate = float(num) / total if total else 1.0
        rate_num = min(int(rate * 100) + 1, 100)
        bar = '\r[%s%s]%d%%' % ("#" * rate_num, " " * (100 - rate_num), rate_num,)
        sys.stdout.write(bar)
        sys.stdout.flush()

    def compare_pic_hash_list(self, file_hash_list, file_path, file_name_list):
        """Compare all hashes pairwise and pick the files that should be deleted.

        For every pair whose similarity exceeds the module-level ``like``
        threshold the smaller file is marked; when both have the same size
        the earlier one in the list is marked.

        :param file_hash_list: hashes, index-aligned with ``file_name_list``.
        :param file_path: directory containing the files.
        :param file_name_list: file names relative to ``file_path``.
        :return: names to delete, without repeats, in the order first marked.
        """
        if len(file_name_list) != len(file_hash_list):
            sys.exit("hash list 与 file list下标不一致！")
        file_count = len(file_name_list)
        # Floor division keeps the pair count an integer on Python 3 as well.
        total = file_count * (file_count - 1) // 2
        self._echo("\n共需要对比", total)
        threshold = float(like)
        marked = []
        count = 0
        for i in range(file_count):
            for j in range(i + 1, file_count):
                count += 1
                self.view_bar(count, total)
                similarity = self.compare_pic_hash(file_hash_list[i], file_hash_list[j])
                if float(similarity) <= threshold:
                    continue
                size_i = self.getDocSize(os.path.join(file_path, file_name_list[i]))
                size_j = self.getDocSize(os.path.join(file_path, file_name_list[j]))
                if size_i is None or size_j is None:
                    # getDocSize already reported the error; without both
                    # sizes there is no basis for choosing which copy to keep.
                    continue
                if size_i > size_j:
                    # File i is kept, so keep scanning for its other duplicates.
                    marked.append(file_name_list[j])
                else:
                    # File i itself goes away; comparing it further is pointless.
                    marked.append(file_name_list[i])
                    break

        # Drop repeated names while keeping first-seen order.
        seen = set()
        news_ids = []
        for name in marked:
            if name not in seen:
                seen.add(name)
                news_ids.append(name)
        self._echo("\n去重后的需要删除的长度", len(news_ids))

        return news_ids

    def delete_file(self, file_path, need_delete_list_file_name):
        """Delete each named file inside ``file_path``.

        Failures are reported on stdout and do not stop the loop.

        :param file_path: directory containing the files.
        :param need_delete_list_file_name: file names relative to ``file_path``.
        :return: None
        """
        for need_delete_file_name in need_delete_list_file_name:
            try:
                os.remove(os.path.join(file_path, need_delete_file_name))
            except Exception as e:
                self._echo("删除失败，原因：", e)

    def main(self, file_path):
        """Run one de-duplication pass over ``file_path``.

        Steps: list the candidate files, hash them, compare the hashes
        pairwise, then delete the smaller file of every similar pair.  Files
        whose name contains "uncensored" are never deleted by this step.

        :param file_path: directory to de-duplicate.
        :return: list of the file names passed to ``delete_file``.
        """
        file_name_list = self.get_file_name_list_from_path(file_path)
        self._echo("文件夹下共包含文件", len(file_name_list), "个")
        file_hash_list = self.get_file_hash_list_from_file_name_list(file_path, file_name_list)
        need_delete_list_file_name = self.compare_pic_hash_list(file_hash_list, file_path, file_name_list)
        need_delete_list_file_name_new = [
            item for item in need_delete_list_file_name
            if "uncensored" not in item
        ]
        self.delete_file(file_path, need_delete_list_file_name_new)
        return need_delete_list_file_name_new

    def file_name(self, file_dir):
        """Return the names of the immediate sub-directories of ``file_dir``.

        :return: list of directory names, or ``None`` when ``os.walk`` yields
            nothing for ``file_dir``.
        """
        for _root, dirs, _files in os.walk(file_dir):
            return dirs  # only the first (top-level) entry is needed
        return None

if __name__ == '__main__':
    # Script entry point: de-duplicate one hard-coded sub-folder of
    # <project>/images, where <project> is the parent of the directory that
    # contains this file.
    deduper = SavePic()
    script_dir = os.path.dirname(os.path.abspath(__file__))
    images_root = os.path.join(os.path.split(script_dir)[0], "images")
    # Sub-folder names of the images directory.  The loop below does not use
    # them; a disabled earlier variant ran up to three passes over every one
    # of these folders instead of the single hard-coded folder.
    sub_folders = deduper.file_name(images_root)

    target_dir = os.path.join(images_root, "kaetzchen")
    # Repeat the pass because deleting files in one pass can leave further
    # duplicates behind; stop early once a pass deletes nothing.
    for attempt in range(3):
        print("%s %s" % ("正在进行的去重次数", attempt))
        removed_names = deduper.main(target_dir)
        if not removed_names:
            break
    # Spoken completion notice through the external `say` command.
    os.system('say "去重完成！"')

热心网友时间：2023-10-21 10:33

存储空间吧不仅能筛选重复的图片还能筛选重复的文档之类的，我自己在用，觉得还不错，挺能清理内存的，

热心网友时间：2023-10-21 10:33

买个小米手机吧，自带的管家，直接识别相似照片

热心网友时间：2023-10-21 10:34

在手机管家上！你打开手机管家点空间清理，看到图片那栏点开就有

热心网友时间：2023-10-21 10:35

最好把手机用数据线连接到电脑，电脑里删除比较快速。

热心网友时间：2023-10-21 10:33

不知道楼主是否还需要，我也有这个需求，但是没找到感觉合适的软件，于是自己写了一段代码，喜欢的话可以拿去用。

热心网友时间：2023-10-21 10:33

存储空间吧不仅能筛选重复的图片还能筛选重复的文档之类的，我自己在用，觉得还不错，挺能清理内存的，

热心网友时间：2023-10-21 10:33

买个小米手机吧，自带的管家，直接识别相似照片

热心网友时间：2023-10-21 10:34

在手机管家上！你打开手机管家点空间清理，看到图片那栏点开就有

热心网友时间：2023-10-21 10:35

最好把手机用数据线连接到电脑，电脑里删除比较快速。