TensorFlow Implementation of BinaryConnect

As a PhD student, especially during the first year, I have read a few hundred research papers, in my case about biometric systems, artificial intelligence, machine learning, deep learning (deep neural networks), and their hardware implementation on FPGAs, ASICs, CPUs, GPUs, etc.

In the fall of 2017 I came across a research paper called "BinaryConnect: Training Deep Neural Networks with binary weights during propagations", i.e., training deep neural networks whose weights are constrained to binary values during the forward and backward propagations. You can read more about it here: https://arxiv.org/abs/1511.00363 or here: https://github.com/MatthieuCourbariaux/BinaryConnect
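The core idea of the paper is that only the propagations use binary weights w_b in {-1, +1}, while a real-valued copy of the weights is kept and updated by the optimizer. Binarization is done either deterministically (w_b = +1 if w >= 0, else -1) or stochastically (w_b = +1 with probability p = hard_sigmoid(w) = clip((w + 1) / 2, 0, 1), else -1). Below is a minimal NumPy sketch of these two rules, purely as my own illustration (it is not code from the paper or from the repository above):

import numpy as np

def hard_sigmoid(x):
    # "hard sigmoid" used by BinaryConnect: clip((x + 1) / 2, 0, 1)
    return np.clip((x + 1.) / 2., 0., 1.)

def binarize(w, stochastic=False):
    # deterministic rule: w_b = +1 if w >= 0, else -1
    # stochastic rule:    w_b = +1 with probability p = hard_sigmoid(w), else -1
    if stochastic:
        p = hard_sigmoid(w)
        return np.where(np.random.rand(*w.shape) < p, 1., -1.)
    return np.where(w >= 0., 1., -1.)

w = np.array([-0.7, -0.1, 0.0, 0.3, 0.9])
print(binarize(w))                   # deterministic: [-1. -1.  1.  1.  1.]
print(binarize(w, stochastic=True))  # stochastic: random signs, biased towards sign(w)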

Since I had not seen a BinaryConnect implementation in TensorFlow anywhere, and because many people are looking for one, as you can see for example in the screenshot here:

I decided to create the first TensorFlow implementation of BinaryConnect, which I am publishing only here:

A) network.py file:

# Tensorflow implementation of BinaryConnect using MNIST created by Sorin Liviu Jurj (www.jurj.de)

import time
import numpy as np
import tensorflow as tf

def hard_sigmoid(x):
    return tf.clip_by_value((x + 1.) / 2., 0, 1)


# The binarization function
def binarization(W, H, binary=True, deterministic=False, stochastic=False):
    # (deterministic == True) <-> test-time <-> inference-time
    if not binary or (deterministic and stochastic):
        # print("not binary")
        Wb = W

    else:
        # [-1,1] -> [0,1]
        Wb = hard_sigmoid(W / H)

        # Stochastic BinaryConnect
        if stochastic:
            # print("stoch")
            Wb = tf.ceil(Wb - tf.random_uniform(tf.shape(Wb)))

        # Deterministic BinaryConnect (round to nearest)
        else:
            # print("det")
            Wb = tf.round(Wb)

        # 0 or 1 -> -1 or 1
        Wb = tf.cast(tf.where(tf.cast(Wb, tf.bool), x=tf.fill(tf.shape(Wb), H), y=tf.fill(tf.shape(Wb), -H)), tf.float32)

    return Wb

# This class reimplements the Lasagne DenseLayer from the original code to support BinaryConnect
class DenseLayer:
    def __init__(self, name, incoming, num_units,
                 binary=True, stochastic=True, H=1., W_LR_scale="Glorot", nonlinearity=None):

        num_inputs = incoming.shape[-1].value
        self.binary = binary
        self.stochastic = stochastic
        self.nonlinearity = nonlinearity

        self.input = incoming
        self.units = num_units

        self.H = H
        if H == "Glorot":
            self.H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))
            # print("H = " + str(self.H))

        self.W_LR_scale = W_LR_scale
        if W_LR_scale == "Glorot":
            self.W_LR_scale = np.float32(1. / np.sqrt(1.5 / (num_inputs + num_units)))

        if self.binary:
            initial = tf.random_uniform([num_inputs, num_units], minval=-H, maxval=H)
            self.W = tf.Variable(initial, name=name + "_W")
            initial_b = tf.constant(0.1, shape=[num_units])
            self.b = tf.Variable(initial_b, name=name + "_b")
        else:
            initial = tf.truncated_normal(shape=[num_inputs, num_units], stddev=0.1)
            self.W = tf.Variable(initial, name=name + "_W")
            initial_b = tf.constant(0.1, shape=[num_units])
            self.b = tf.Variable(initial_b, name=name + "_b")

    def get_output(self, deterministic=False):
        self.Wb = binarization(self.W, self.H, self.binary, deterministic, self.stochastic)
        Wr = self.W
        self.W = self.Wb

        # the @ operator performs matrix multiplication, just like tf.matmul(x, y)
        outputs = self.input @ self.W + self.b

        self.W = Wr

        return self.nonlinearity(outputs)

# This class reimplements the Lasagne Conv2DLayer from the original code to support BinaryConnect
class Conv2DLayer:
    def __init__(self, name, incoming, num_filters, filter_size, strides,
                 binary=True, stochastic=True, H=1., W_LR_scale="Glorot", nonlinearity=None):

        num_inputs = incoming.shape[-1].value
        self.binary = binary
        self.stochastic = stochastic
        self.nonlinearity = nonlinearity

        self.input = incoming
        self.strides = strides

        self.H = H
        if H == "Glorot":
            num_units = int(np.prod(filter_size) * num_filters)
            self.H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))

        self.W_LR_scale = W_LR_scale
        if W_LR_scale == "Glorot":
            num_units = int(np.prod(filter_size) * num_filters)
            self.W_LR_scale = np.float32(1. / np.sqrt(1.5 / (num_inputs + num_units)))

        if self.binary:
            initial = tf.random_uniform([filter_size[0], filter_size[1], num_inputs, num_filters], minval=-H, maxval=H)
            self.W = tf.Variable(initial, name=name + "_W")
            initial_b = tf.constant(0.1, shape=[num_filters])
            self.b = tf.Variable(initial_b, name=name + "_b")
        else:
            initial = tf.truncated_normal(shape=[filter_size[0], filter_size[1], num_inputs, num_filters], stddev=0.1)
            self.W = tf.Variable(initial, name=name + "_W")
            initial_b = tf.constant(0.1, shape=[num_filters])
            self.b = tf.Variable(initial_b, name=name + "_b")

    def convolve(self, deterministic=False):
        self.Wb = binarization(self.W, self.H, self.binary, deterministic, self.stochastic)
        Wr = self.W
        self.W = self.Wb

        rvalue = tf.nn.conv2d(self.input, self.W, strides=self.strides, padding='SAME') + self.b

        self.W = Wr

        return self.nonlinearity(rvalue)

# This function clips and scales the weights after the parameter update
def clipping_scaling(updates, network, lr):
    _i = 0
    op = []
    for layer in network:
        # Assume that every layer only has weights and bias as trainable params
        # Manually apply gradients to clipped parameters

        if updates[_i][0] is not None:
            param = layer.W + layer.W_LR_scale * (updates[_i][0] - layer.W)
            param = tf.clip_by_value(param, -layer.H, layer.H)

            op.append(tf.assign(layer.W, tf.add(updates[_i][0] * lr, param)))
        _i += 1

        if updates[_i][0] is not None:
            param = layer.b + layer.W_LR_scale * (updates[_i][0] - layer.b)
            param = tf.clip_by_value(param, -layer.H, layer.H)

            op.append(tf.assign(layer.b, tf.add(updates[_i][0] * lr, param)))
        _i += 1

    return op

# Given a dataset and a model, this function trains the model on the dataset for several epochs
# (There is no default train function in Lasagne yet)
def train(loss_fn, train_fn, acc_op,
          input_holder, target_holder,
          batch_size,
          lr_holder, LR_start, LR_decay,
          num_epochs,
          data_set):
    # This function trains the model a full epoch (on the whole dataset)
    def train_epoch(LR, sess):

        loss = 0
        # 50000 is the number of images from the dataset that we use for training (out of 60000),
        # and 10000 are in the test set, so we use the same number for validation
        batches = 50000 // batch_size

        for i in range(batches):
            batch = data_set.train.next_batch(batch_size)
            # feed_dict is a dictionary used to pass data to the placeholder tensors
            l, _ = sess.run([loss_fn, train_fn], feed_dict={input_holder: batch[0],
                                                            target_holder: batch[1],
                                                            lr_holder: LR})
            loss += l

        loss /= batches

        return loss

    # This function tests the model a full epoch (on the whole dataset)
    def val_epoch(data, sess):
        loss = 0
        acc = 0
        batches = 10000 // batch_size

        for i in range(batches):
            batch = data.next_batch(batch_size)
            l, a = sess.run([loss_fn, acc_op], feed_dict={input_holder: batch[0],
                                                          target_holder: batch[1]})
            loss += l
            acc += a

        loss /= batches
        acc = acc / batches * 100

        return loss, acc

    best_val_acc = 0
    best_epoch = 1
    LR = LR_start

    # A Session object encapsulates the environment in which Operation objects are executed
    # and Tensor objects are evaluated.
    # A Tensor is a symbolic handle to one of the outputs of an Operation. It does not hold the values of that
    # operation's output, but instead provides a means of computing those values in a TensorFlow tf.Session
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # We iterate over epochs:
        for epoch in range(num_epochs):

            start_time = time.time()

            train_loss = train_epoch(LR, sess)

            val_loss, val_acc = val_epoch(data_set.train, sess)

            # test if the validation accuracy went up
            if val_acc >= best_val_acc:
                best_val_acc = val_acc
                best_epoch = epoch + 1

                test_loss, test_acc = val_epoch(data_set.test, sess)

            epoch_duration = time.time() - start_time

            # Then we print the results for this epoch:
            print("Epoch " + str(epoch + 1) + " of " + str(num_epochs) + " took " + str(epoch_duration) + "s")
            print("  LR: " + str(LR))
            print("  training loss: " + str(train_loss))
            print("  validation accuracy: " + str(val_acc) + "%")
            print("  validation loss: " + str(val_loss))
            print("  best epoch: " + str(best_epoch))
            print("  best validation accuracy: " + str(best_val_acc) + "%")
            print("  test accuracy: " + str(test_acc) + "%")

            # decay the LR
            LR *= LR_decay
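If you want to sanity-check the binarization op on its own before training, you could run a small script like the following. This is only a test sketch of mine (it assumes network.py from above is in the same directory and a TensorFlow 1.x environment, which is what the code in this post targets):

import tensorflow as tf
import network

# a small real-valued weight matrix to binarize
W = tf.Variable([[0.3, -0.8], [-0.05, 0.6]], dtype=tf.float32)

# deterministic and stochastic binarization to {-H, +H} with H = 1
Wb_det = network.binarization(W, H=1., binary=True, deterministic=True, stochastic=False)
Wb_sto = network.binarization(W, H=1., binary=True, deterministic=False, stochastic=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(W))       # the real-valued weights
    print(sess.run(Wb_det))  # [[ 1. -1.] [-1.  1.]], i.e. sign(W) scaled by H
    print(sess.run(Wb_sto))  # random signs, biased towards sign(W)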

B) run.py file:

# Tensorflow implementation of BinaryConnect using MNIST created by Sorin Liviu Jurj (www.jurj.de)
# This code was written to be used on the MNIST dataset.
# For the most part I followed the requested architecture; the major changes I made,
# and for which you will not find comments in the code, are:
# - the network architecture: the GitHub implementation did not even use Conv2DLayer, so I used a standard architecture
# - the way the data is preprocessed, because the pylearn2 module only works with Theano,
#   so you have to download the dataset manually from the site mentioned below
# - various changes required for the TensorFlow implementation

# I did not optimize the hyperparameters.
# There is still room for improvement, so feel free to play around with this code.

import numpy as np

import network

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

np.random.seed(1234) # for reproducibility

# Replace this with the path to MNIST data downloaded from http://yann.lecun.com/exdb/mnist/
data_dir = '/tmp/BinaryConnect-Tensorflow/'
mnist = input_data.read_data_sets(data_dir, one_hot=False)

if __name__ == "__main__":

    # BN parameters
    batch_size = 100
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor

    # Training parameters
    num_epochs = 250
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = 1  # 1 means no dropout
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = 1
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryConnect
    binary = False
    print("binary = " + str(binary))
    stochastic = True
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    LR_start = .001  # Use .0001 if not binary
    print("LR_start = " + str(LR_start))
    LR_fin = 0.000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    print('Loading MNIST dataset...')

    print('Building the MLP...')

    nn = []

    with tf.Graph().as_default():
        # these placeholders must be "filled" with data whenever any tensor that uses them is run
        input_holder = tf.placeholder(tf.float32, [None, 784])
        target_holder = tf.placeholder(tf.int64, [None])
        # the learning rate is fed through a placeholder to make learning rate decay easier to implement
        lr_holder = tf.placeholder(tf.float32, [])

        input = tf.reshape(input_holder, [-1, 28, 28, 1])

        mlp = network.Conv2DLayer(
            "conv_layer_1",
            incoming=input,
            binary=binary,
            stochastic=stochastic,
            H=H,
            nonlinearity=tf.nn.relu,
            num_filters=32,
            filter_size=[5, 5],
            strides=[1, 1, 1, 1]
        )
        nn.append(mlp)
        mlp = tf.nn.max_pool(mlp.convolve(), ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1], padding='SAME')
        mlp = network.Conv2DLayer(
            "conv_layer_2",
            incoming=mlp,
            binary=binary,
            stochastic=stochastic,
            H=H,
            nonlinearity=tf.nn.relu,
            num_filters=64,
            filter_size=[5, 5],
            strides=[1, 1, 1, 1]
        )
        nn.append(mlp)
        mlp = tf.nn.max_pool(mlp.convolve(), ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1], padding='SAME')

        mlp = tf.reshape(mlp, [-1, 7 * 7 * 64])

        mlp = network.DenseLayer(
            "dense_layer_1",
            incoming=mlp,
            binary=binary,
            stochastic=stochastic,
            H=H,
            nonlinearity=tf.nn.relu,
            num_units=1024)
        nn.append(mlp)

        mlp = tf.nn.dropout(mlp.get_output(), dropout_hidden)

        mlp = network.DenseLayer(
            "output_dense_layer",
            incoming=mlp,
            binary=binary,
            stochastic=stochastic,
            H=H,
            nonlinearity=tf.identity,
            num_units=10)
        nn.append(mlp)

        output = mlp.get_output()

        # sparse because the labels are not one-hot encoded
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output, labels=target_holder)
        loss = tf.reduce_mean(loss)

        optimizer = tf.train.AdamOptimizer(learning_rate=lr_holder)

        # the parameter update operation is handled internally by TensorFlow
        updates = optimizer.minimize(loss)

        prediction = tf.argmax(output, 1)
        equality = tf.equal(prediction, target_holder)
        accuracy = tf.reduce_mean(tf.cast(equality, tf.float32))

        print('Training...')

        network.train(
            loss, updates, accuracy,
            input_holder, target_holder,
            batch_size,
            lr_holder, LR_start, LR_decay,
            num_epochs,
            mnist)
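To reproduce a training run, save the two listings above as network.py and run.py in the same folder, make sure the MNIST files are available under data_dir (as the comment in run.py suggests), and start run.py with a TensorFlow 1.x installation. Note that run.py is configured with binary = False, which trains an ordinary float-weight baseline; to train with BinaryConnect itself, set binary = True and pick stochastic = True or False for the stochastic or deterministic variant described in the paper.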
