## l0_attack.py -- Papernot et al.'s l0 attack to find adversarial examples
##
## Copyright (C) 2016, Nicholas Carlini <[email protected]>.
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import sys
import os
from robustml_model import MODEL_PATH
import time
import tensorflow as tf
import numpy as np
from setup import *
from model import make_model
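# NOTE: assumes the distilled model was trained at this temperature (see
# train_distillation.py). Dividing the logits by TEMPERATURE before the
# softmax keeps it from saturating, so its gradients do not vanish.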
TEMPERATURE = 100
if not os.path.exists(MODEL_PATH):
    raise RuntimeError('Did you train the models as described in the readme? Run train_distillation.py')
model = make_model(MODEL_PATH)
BATCH_SIZE = 1
img = tf.placeholder(tf.float32, (BATCH_SIZE,IMAGE_SIZE,IMAGE_SIZE,NUM_CHANNELS))
lab = tf.placeholder(tf.float32, (BATCH_SIZE,10))
delta = tf.Variable(tf.zeros((BATCH_SIZE,IMAGE_SIZE,IMAGE_SIZE,NUM_CHANNELS)))
out = tf.nn.softmax(model(img+delta)/TEMPERATURE)
target_probability = tf.reduce_sum(out*lab,0)
other_probability = tf.reduce_sum(out*(1-lab),0)
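# Gradients of the target-class probability and of the combined other-class
# probability with respect to the perturbation delta; these supply the two
# quantities (alpha and beta) used to rank candidate pixels in the attack.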
grads_target = tf.gradients(target_probability, [delta])[0]
grads_other = tf.gradients(other_probability, [delta])[0]
has_setup = False
def modified_papernot_attack(imgs, labs, s, eps=112):
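    """Run the modified Papernot-style L0 attack against the distilled model.

    Each example is pushed towards a randomly chosen incorrect target class by
    repeatedly modifying the single most salient pixel, up to `eps` changes.
    Prints the mean number of pixels changed and the attack success rate, and
    returns the (last batch of) perturbed images.
    """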
    global has_setup
    if not has_setup:
        s.run(tf.global_variables_initializer())
        has_setup = True

    total = []
    costs = []
    for offset in range(0,len(imgs),BATCH_SIZE):
        obatch_imgs = imgs[offset:offset+BATCH_SIZE]
        batch_labs = labs[offset:offset+BATCH_SIZE]
        batch_imgs = np.copy(obatch_imgs)
        used = np.zeros((BATCH_SIZE,IMAGE_SIZE,IMAGE_SIZE))
        # 1. randomly sample a target which is not the current one
        targets = batch_labs
        while np.sum(targets*batch_labs) != 0:
            targets = np.array([np.identity(10)[random.randint(0,9)] for _ in range(BATCH_SIZE)])
        # 2. Try changing pixels up to 112 times
        for _ in range(eps):
            # 3. Find which ones we've already succeeded on.
            the_outs = s.run(out, feed_dict={img: batch_imgs})
            success = np.argmax(the_outs,axis=1) == np.argmax(targets,axis=1)
            if np.sum(success) == BATCH_SIZE:
                # abort early if we are done
                break

            # 4. Compute the gradients required (alpha and beta)
            dir_targ, dir_other = s.run([grads_target, grads_other],
                                        feed_dict={img: batch_imgs, lab: targets})
            for e in range(BATCH_SIZE):
                if not success[e]:
                    # 5. Pick the next most important pixel we can change
                    directions = (-dir_other+dir_targ)
                    while True:
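                        # Saliency score: total gradient magnitude per pixel, masked to
                        # pixels whose target/other gradients have the required signs and
                        # that have not been modified before.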
                        dirs = np.sum(np.abs(directions),axis=3) * (np.sum(dir_other,axis=3) > 0) * (np.sum(dir_targ,axis=3) < 0) * (1-used[e])
                        highest = np.argmax(dirs[e,:,:])
                        x,y = highest%IMAGE_SIZE, highest//IMAGE_SIZE
                        curval = batch_imgs[e,y,x,:]
                        change = np.sign(directions[e,y,x])
                        if np.all(change == 0):
                            break
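                        # Only flip the pixel if it is not already saturated in the chosen
                        # direction; the update below is clipped back into [-0.5, 0.5].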
                        if abs(curval+change) < 1.499:
                            # 6. Actually change it by the right direction
                            used[e,y,x] += 1
                            batch_imgs[e,y,x,:] += change
                            batch_imgs = np.clip(batch_imgs, -.5, .5)
                            break
                        else:
                            directions[e,y,x] = 0
        # Recompute the success probability
        the_outs = s.run(out, feed_dict={img: batch_imgs})
        success = (np.argmax(the_outs,axis=1) == np.argmax(targets,axis=1))

        # Count the number of pixels we had to change
        different = (batch_imgs!=obatch_imgs).reshape((BATCH_SIZE,IMAGE_SIZE**2,NUM_CHANNELS))
        different = np.any(different,axis=2)

        # And success requires we change fewer than 112 pixels.
        success &= np.sum(different,axis=1) < eps

        costs.extend(np.sum(different,axis=1))
        total.extend(success)
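        # Report the mean number of pixels changed and the attack success rate.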
        print(np.mean(costs),np.mean(total))
    return batch_imgs
if __name__ == "__main__":
    with tf.Session() as s:
        print("Number of pixels changed / Probability of Attack Success")
        print(modified_papernot_attack(test_data[:10000], test_labels[:10000], s, 112))