#!/usr/bin/env bash
# check that the first argument exists and is an accessible path
if ! dir=$(ls -pd "$1"); then
prog=$(basename "$0")
echo "$prog - Finds duplicate folders
First checks the size, then checks the hash of folders with the same size.
Usage:
Find duplicate folders inside directory:
$prog dir_name
Specify the minimum size of folders to consider, in bytes:
$prog dir_name min_size
Example: analyze the folders contained in the current folder and ignore the empty ones.
$prog . 5000"
exit 1
fi
#######################################################
############# define helper function ##################
# function that computes a non-cryptographic hash of a folder, calculated from the hashes of all the files it contains, regardless of the file names
hash_dir () {
# it ignores the presence of symbolic links
if [[ -L "$1" ]]; then
>&2 echo Ignoring symbolic link "$1"
elif [[ -d "$1" ]]; then
# checks if the folder is accessible (readable and executable)
# in case it is not, mark this and the parent folders as bad content
# folders with bad content will not be considered for duplicates
if [[ ! -x "$1" || ! -r "$1" ]]; then
>&2 echo Found non-accessible folder "$1", ignoring all the parent folders
echo ////BAD CONTENT////
else
# launch this same function, in a recursive fashion,
# over all the found subdirectories and files
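# note: "$1" is expected to end with a slash, and ls -p appends a slash to
# directory entries, so the concatenation "$1$p" is again a valid path and
# the trailing-slash convention is preserved across the recursive calls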
dirhashes=$(ls -1Ap "$1" | while read -r p; do hash_dir "$1$p"; done)
# if any of the subdirectories had something marked as
# "bad content", then mark also this parent folder as bad content
if [[ $dirhashes =~ "////BAD CONTENT////" ]]; then
echo ////BAD CONTENT////
else
# otherwise sort the content by hash (not by file name!)
# and take a hash of the hashes of the subdirectories and files
dirhash=$(echo "$dirhashes" | sort | xxhsum -H3)
# print hash for the parent function to process
echo "$dirhash"
# use a separate file descriptor (as STDOUT is used for marking bad content
# and hashes, and STDERR often carries errors) for outputting the
# actually interesting results
# this implies that, when calling this function, there has to be
# some redirect in place, ready for capturing file descriptor 3, e.g.:
# hash_dir ./ 3>&1
# otherwise, you get an error:
# hash_dir ./
# 3: bad file descriptor
>&3 echo "$dirhash" "$1"
fi
fi
# if a non-regular file (e.g. pipes, block devices...) is found, mark
# the directory as bad content
elif [[ ! -f "$1" ]]; then
>&2 echo Found non-regular file "$1", ignoring all the parent folders
echo ////BAD CONTENT////
# if a non-readable file is found, mark the directory as bad content, so that
# it does not get considered for duplicates
elif [[ ! -r "$1" ]]; then
>&2 echo Found non-readable file "$1", ignoring all the parent folders
echo ////BAD CONTENT////
# if the given argument is a regular file, take its hash
else
# the hash function has been selected based on a comparison published
# by its authors:
# https://xxhash.com/#benchmarks
hash=$(xxhsum -H3 < "$1")
echo "$hash"
# when using this function outside of the finddirdupes.sh file, you can set
# an environment variable to also print the individual file hashes
[ "$ALSO_FILES" ] && >&3 echo "$hash" "$1"
fi
}
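# minimal usage sketch for calling hash_dir on its own (the folder name below
# is only an example): after copying the function into a shell, something like
#   ALSO_FILES=1 hash_dir ./some_folder/ 3>&1
# prints one line per readable file and folder under ./some_folder/, containing
# its hash and its path (the trailing slash and the 3>&1 redirect are both needed)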
# check if the second argument is a number, if so, use it as minimum folder size
# https://stackoverflow.com/a/808740/5033401
if [ -n "$2" ] && [ "$2" -eq "$2" ] 2>/dev/null; then
du_out=$(du -t "$2" "$dir")
else
du_out=$(du "$dir")
fi
#######################################################
########### find folders with the same size ###########
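# the du output processed here has one "size<TAB>path" line per folder,
# with the top directory (and therefore the largest size) on the last line,
# for example (sizes purely illustrative):
#   4096    ./empty_dir
#   123456  ./photos/2021
#   1234567 ./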
# find how many characters wide the size column of the du output is
# (the last du line is the top directory, i.e. the largest size)
du_out_last_line=$(tail -1 <<< "$du_out")
# du separates size and path with a tab, which is cut's default delimiter
max_size_length=$(echo -n "$du_out_last_line" | cut -f 1 | wc -c)
# uniq cannot compare "the first field", it only compares a fixed number of characters,
# but the width of the size column is different for each line,
# so I widen the separation between the columns by adding some padding
# (the tab that du uses as separator gets replaced by the padding)
du_out_spacer=$(sed 's/\t/...................\//' <<< "$du_out")
# keep only the directories with the same size
du_dups=$(sort <<< "$du_out_spacer" | uniq -D -w "$max_size_length" )
#######################################################
############### eliminate subfolders ##################
# exclude from the folders list all the folders which have a parent folder also in the list
# for example, if the folders list was:
# uno/due
# uno
# tre
# then the parents list will be:
# uno
# tre
# list folders, removing the size
folders=$(cut -d"/" -f 2- <<< "$du_dups")
# append slash
folders_slash=$(sed 's/$/\//' <<< "$folders" )
# remove from the list all the folders which contain the name of another folder plus a slash
# which are the subfolders of another folder in the list
# the -F option treats the folder names as fixed strings, avoiding the "grep: warning: stray \ before /" message in presence of dirs named "\"
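# for example, with the list above the patterns become "uno/due/", "uno/" and "tre/":
# "uno/due" contains "uno/" and is dropped, while "uno" and "tre" match none of the patterns and are kept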
parents=$(grep -v -F -f <(cat <<< "$folders_slash") <<< "$folders")
#######################################################
####### analyze in depth the selected folders ##########
hash_dir_dups=""
while read -r parent; do
# use the hash_dir function for calculating the hash of the folders
# ignore the internal bad content marks
hash_dir_dups+=$(hash_dir "$parent/" 3>&1 | grep -v "////BAD CONTENT////")
# add a newline between folder and folder
hash_dir_dups+="
"
done <<< "$parents"
# keep only the entries with a duplicate hash
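# (uniq -w 32 limits the comparison to the first 32 characters of each line,
# which should cover xxhsum's label and the hash without reaching into the path)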
hash_dups=$(sort <<< "$hash_dir_dups" | uniq -D -w 32 )
# the top-level hash_dir call also prints its own hash on STDOUT without a path,
# and that line is captured together with the file descriptor 3 output,
# so drop the lines that carry no path
hash_dups=$(grep "/" <<< "$hash_dups")
#######################################################
############ eliminate subfolders again ###############
# exclude from the folders list all the folders which have a parent folder also in the list
# for example, if the folders list was:
# uno/due
# uno
# tre
# then the parents list will be:
# uno
# tre
# list the folders, removing the hash part (everything up to and including the first slash) but keeping the trailing slash
dups_slash=$(cut -d"/" -f 2- <<< "$hash_dups")
# remove the trailing slash from the full output
hash_dups_noslash=$(sed 's/\/$//' <<< "$hash_dups")
# remove from the list all the folders which contain the name of another folder plus a slash
# which are the subfolders of another folder in the list
# the -F option treats the folder names as fixed strings, avoiding the "grep: warning: stray \ before /" message in presence of dirs named "\"
hash_dups_nosub=$(grep -v -F -f <(cat <<< "$dups_slash") <<< "$hash_dups_noslash")
# but this time we need to keep the children in some cases,
# for example if we have these two groups of duplicates:
# uno/due/
# tre/quattro/
# cinque/
#
# uno/
# tre/
#
# then removing uno/due/ and tre/quattro/ would leave cinque as if it was not a duplicate
# so we take the hashes of the parent folders and we look for them in the original list
# here we take the hashes
hashes_of_hash_dups_nosub=$(cut -d" " -f -4 <<< "$hash_dups_nosub")
# and we use the hashes for filtering the original list
hash_dups_allparents=$(grep -f <(cat <<< "$hashes_of_hash_dups_nosub") <<< "$hash_dups")
#######################################################
############## print in a nice format #################
if [[ $hash_dups_allparents ]]; then
# take only the last two fields of the strings and add
# info about the size of the folders
hash_size_dups=$(while read -r _ _ _ hash ddir; do
# check if the second argument is a number, if so,
# use it as minimum folder size
# https://stackoverflow.com/a/808740/5033401
if [ -n "$2" ] && [ "$2" -eq "$2" ] 2>/dev/null; then
size_dir=$(du -s -t "$2" "$ddir")
# if the folder is smaller than $2, do not print anything
if [ -n "$size_dir" ]; then
echo "$hash" "$size_dir"
fi
else
echo "$hash" $(du -s "$ddir")
fi
done <<< "$hash_dups_allparents")
# sort by the size
hash_size_dups_sorted=$(sort -k2,2n -k1,1 <<< "$hash_size_dups")
# add empty lines dividers and remove the hash
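# the final output is one "size path" line per duplicate folder, with a blank
# line between groups of identical folders, for example (values illustrative):
#   4096 ./a/keep
#   4096 ./b/copy-of-keep
#
#   81264 ./photos/2020
#   81264 ./backup/photos-2020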
uniq --all-repeated=separate -w 16 <<< "$hash_size_dups_sorted" | cut -d" " -f 2-
else
echo No duplicate folders found.
fi