This session closely follows *What is torch.nn really?* by Jeremy Howard.
Q1. Implement a basic neural network model that composes its layers in sequence and returns a prediction.
Q2. Implement `log_softmax` and `cross_entropy` functions, and calculate the loss. How are the two functions related?
Q3. Refactor `log_softmax` using the LogSumExp trick. Verify that the customized version is identical to PyTorch's.
Q4. Implement an accuracy function and check the result with one small batch of training data and the freshly initialized model. After that, do mini-batch learning with the minimal ingredients of a training loop, i.e., forward/loss/backward/optimize. You don't have to validate the model here.
Q5. This time, use `nn.Module.__setattr__` in your customized Model so that you can access the model's parameters during training. Compare `fit` (the training process) with the previous one. How has it improved?
Q6. Let's crack how PyTorch overrides the `__setattr__` method. This time implement a customized module, `DummyModule`, without inheriting from `nn.Module`. Test the customized module with a model.
Q7. Now suppose we don't know how many layers we will get, which means you have to pass the layers in as a parameter (not as attributes inside the model, like l1, l2, etc.). Implement your model in three different ways:
- You can register the modules one by one,
- or you can hand a list of layers to `nn.ModuleList`,
- finally, `nn.Sequential` does both: 1) it takes the list of modules and registers them, and 2) it composes the layers.
Summary: so far, starting from the basic training loop, we have handed over to `torch.nn` 1) checking whether a layer has parameters and 2) composing an arbitrary number of layers.
- Still left to hand over: updating the parameters and zeroing the remaining gradients.
Q8. Make an (SGD) `Optimizer` class with methods for updating the parameters and emptying the gradients. Test the optimizer by training the model.
- Hint: `nn.Sequential.parameters()` is a generator (see the sketch after A8 below).
Q9. This time, use PyTorch's `torch.optim` module instead of the hand-rolled optimizer.
Q10. Implement a `Dataset` which combines the x and y data. Train using that dataset.
Q11. Implement a `DataLoader` so that your loop looks much cleaner. Train using that dataloader.
Q12. Now implement a `Sampler` which puts the training set in a random order, with a different order on each iteration. Combine it with a `collate` function so that the randomly sampled examples are stacked into separate `xb`, `yb` batches.
Q13. Use PyTorch's `DataLoader`.
A1.
```python3
import torch
from torch import nn
import torch.nn.functional as F

# x_train, y_train: the MNIST training tensors loaded as in the original tutorial
n, m = x_train.shape
c = y_train.max() + 1   # number of classes
nh = 50

class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x

pred = Model(m, nh, 10)(x_train)
```
A2. The negative log likelihood of the log-softmax outputs is exactly the cross entropy: because the target is a one-hot encoded vector, cross entropy reduces to picking out the log-softmax value (the log likelihood) of the correct class, so `cross_entropy(pred, target) == nll(log_softmax(pred), target)`.
```python3
# naive version
def log_softmax(x): return (x.exp() / x.exp().sum(-1, keepdim=True)).log()
# simplified: log(exp(x) / sum(exp(x))) = x - log(sum(exp(x)))
def log_softmax(x): return x - x.exp().sum(-1, keepdim=True).log()

def nll(input, target): return -input[range(target.shape[0]), target].mean()

loss = nll(log_softmax(pred), y_train)
```
A3.
```python3
def logsumexp(x):
    m = x.max(-1)[0]
    return m + (x - m[:, None]).exp().sum(-1).log()

# test_near: the course's helper that checks two tensors are equal within tolerance
test_near(logsumexp(pred), pred.logsumexp(-1))

def log_softmax(x): return x - x.logsumexp(-1, keepdim=True)

test_near(nll(log_softmax(pred), y_train), loss)
test_near(F.nll_loss(F.log_softmax(pred, -1), y_train), loss)
test_near(F.cross_entropy(pred, y_train), loss)
```
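Why the LogSumExp trick is worth the refactor: with large logits the direct `exp().sum()` overflows, while `logsumexp` only ever exponentiates `x - max(x)`. A tiny sketch with a toy tensor (not from the post):

```python3
big = torch.tensor([[1000., 1000., 1000.]])

naive  = (big.exp() / big.exp().sum(-1, keepdim=True)).log()  # exp(1000) overflows: inf/inf -> nan
stable = big - big.logsumexp(-1, keepdim=True)                # subtracts the max internally, stays finite

print(naive)    # tensor([[nan, nan, nan]])
print(stable)   # tensor([[-1.0986, -1.0986, -1.0986]])
```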
A4.
```python3
# see the initialized model's performance
model = Model(m, nh, 10)
loss_func = F.cross_entropy

def accuracy(out, yb): return (torch.argmax(out, dim=1) == yb).float().mean()

bs = 64                 # batch size
xb = x_train[0:bs]      # a mini-batch from x
yb = y_train[0:bs]      # the corresponding labels
preds = model(xb)       # predictions
accuracy(preds, yb)

# naive mini-batch training
lr, epochs = 0.5, 1
for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        start_i = i * bs
        end_i = start_i + bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        loss = loss_func(model(xb), yb)
        loss.backward()
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr
                    l.bias   -= l.bias.grad * lr
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()

accuracy(model(xb), yb)
```
A5.
```python3
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)
    def __call__(self, x): return self.l2(F.relu(self.l1(x)))

model = Model(m, nh, 10)

def fit():
    for epoch in range(epochs):
        for i in range((n - 1) // bs + 1):
            start_i = i * bs
            end_i = start_i + bs
            xb = x_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            loss = loss_func(model(xb), yb)
            loss.backward()
            with torch.no_grad():
                for p in model.parameters(): p -= p.grad * lr
                model.zero_grad()
```
- No need to check whether a model layer has a 'weight'/parameter attribute, since `nn.Module` handles that by itself by overriding the `__setattr__` method.
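To make explicit what that registration buys us, a quick check (just a sketch using the `model` defined above):

```python3
# nn.Module.__setattr__ registered l1 and l2 as submodules...
print(dict(model.named_children()).keys())    # dict_keys(['l1', 'l2'])

# ...so their tensors show up automatically in model.parameters()
print([name for name, p in model.named_parameters()])
# ['l1.weight', 'l1.bias', 'l2.weight', 'l2.bias']
```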
A6.
```python3
class DummyModule():
    def __init__(self, n_in, nh, n_out):
        self._modules = {}
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)
    def __setattr__(self, k, v):
        if not k.startswith("_"): self._modules[k] = v
        super().__setattr__(k, v)
    def __repr__(self): return f'{self._modules}'
    def parameters(self):
        for l in self._modules.values():
            for p in l.parameters(): yield p
    def __call__(self, x): return self.l2(F.relu(self.l1(x)))
    def zero_grad(self):
        for p in self.parameters():
            p.grad.data.zero_()

def fit():
    for epoch in range(epochs):
        for i in range((n - 1) // bs + 1):
            start_i = i * bs
            end_i = start_i + bs
            xb = x_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            loss = loss_func(model(xb), yb)
            loss.backward()
            with torch.no_grad():
                for p in model.parameters(): p -= p.grad * lr
                model.zero_grad()
```
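The answer stops at the definitions; a plausible way to test the custom module (assuming the globals from the earlier answers) could be:

```python3
model = DummyModule(m, nh, 10)
print(model)      # __repr__ shows the registered submodules

fit()
accuracy(model(x_train[:bs]), y_train[:bs])
```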
A7.
```python3
# 1) register the modules one by one with add_module
layers = [nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10)]

class Model(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        for i, l in enumerate(self.layers): self.add_module(f'layer_{i}', l)
    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x

model = Model(layers)

# 2) let nn.ModuleList do the registration
class SequentialModel(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)
    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x

model = SequentialModel(layers)

# 3) nn.Sequential registers and composes the layers for us
model = nn.Sequential(nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10))
```
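A quick sanity check (my own sketch, not from the post) that all three approaches register the same parameters:

```python3
for mdl in (Model(layers), SequentialModel(layers), nn.Sequential(*layers)):
    print(sum(p.numel() for p in mdl.parameters()))
# the three counts are identical; only the registration mechanism differs
```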
A8.
```python3
class Optimizer():
    # materialize params: nn.Sequential.parameters() is a generator and
    # would otherwise be exhausted after the first pass
    def __init__(self, params, lr): self.params, self.lr = list(params), lr
    def zero_grad(self):
        for p in self.params:
            p.grad.data.zero_()
    def step(self):
        with torch.no_grad():
            for p in self.params:
                p -= p.grad * self.lr

def fit():
    for epoch in range(epochs):
        for i in range((n - 1) // bs + 1):
            start_i = i * bs
            end_i = start_i + bs
            xb = x_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()

model = nn.Sequential(nn.Linear(x_train.shape[1], nh), nn.ReLU(), nn.Linear(nh, c))

# accuracy before training
(model(x_train[:bs]).max(-1).indices == y_train[:bs]).sum() / bs

opt = Optimizer(model.parameters(), 0.9)
fit()

# accuracy after training
(model(x_train[:bs]).max(-1).indices == y_train[:bs]).sum() / bs
```
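Why the hint about generators matters: `parameters()` can only be consumed once, so an optimizer that kept the raw generator would update nothing after its first pass. A tiny standalone sketch (toy sizes, just for illustration):

```python3
seq = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 2))

gen = seq.parameters()
print(len(list(gen)))   # 4 (two weights, two biases)
print(len(list(gen)))   # 0 -- the generator is already exhausted

params = list(seq.parameters())   # materialize once, reuse every step
print(len(params), len(params))   # 4 4
```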
A9.
```python3
from torch import optim

def get_model(model_func, lr=0.9):
    model = nn.Sequential(*model_func())
    return model, optim.SGD(model.parameters(), lr=lr)

def get_layers():
    return nn.Linear(x_train.shape[1], nh), nn.ReLU(), nn.Linear(nh, c)

model, opt = get_model(get_layers, 0.5)
print((model(x_train[:bs]).max(-1).indices == y_train[:bs]).sum() / bs)
fit()
print((model(x_train[:bs]).max(-1).indices == y_train[:bs]).sum() / bs)
```
A10.
```python3
class Dataset():
    def __init__(self, x, y): self.x, self.y = x, y
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i], self.y[i]

# model and a matching optimizer must exist before the training loop
model = nn.Sequential(nn.Linear(x_train.shape[1], nh), nn.ReLU(), nn.Linear(nh, c))
opt = optim.SGD(model.parameters(), lr=0.5)

train_ds = Dataset(x_train, y_train)
for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        xb, yb = train_ds[i * bs:(i + 1) * bs]
        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

loss, acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
```
A11.
```python3
class DataLoader():
    def __init__(self, ds, bs): self.ds, self.bs = ds, bs
    def __iter__(self):
        for i in range(0, len(self.ds), self.bs): yield self.ds[i:i + self.bs]

loss_func = F.cross_entropy
model, opt = get_model(get_layers, lr=0.001)
train_dl = DataLoader(train_ds, bs)

for epoch in range(epochs):
    for xb, yb in train_dl:
        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
```
A12.
```python3
class Sampler():
    def __init__(self, ds, bs, shuffle=False):
        self.n, self.bs, self.shuffle = len(ds), bs, shuffle
    def __iter__(self):
        self.idxs = torch.randperm(self.n) if self.shuffle else torch.arange(self.n)
        for i in range(0, self.n, self.bs): yield self.idxs[i:i + self.bs]

small_ds = Dataset(*train_ds[:50])

# shuffled sampling: each iteration yields a different random order
os = Sampler(small_ds, 10, True)
[o for o in os]
os = Sampler(small_ds, 10, True)
[o for o in os]

def collate(batch):
    xs, ys = zip(*batch)
    return torch.stack(xs), torch.stack(ys)

class DataLoader():
    def __init__(self, ds, sampler, collate_fn=collate):
        self.ds, self.sampler, self.collate_fn = ds, sampler, collate_fn
    def __iter__(self):
        for s in self.sampler: yield self.collate_fn([self.ds[i] for i in s])

train_samp = Sampler(small_ds, bs, shuffle=True)
train_dl = DataLoader(small_ds, sampler=train_samp, collate_fn=collate)
next(iter(train_dl))

# how zip(*...) unpacks a list of (x, y) pairs
test_list = [('x1', 'y1'), ('x2', 'y2'), ('x3', 'y2')]
list(zip(*test_list))
for i in zip(*test_list):
    print(i)
    break
```
A13.
```python3
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler

# valid_ds: a validation Dataset built the same way as train_ds
train_dl = DataLoader(train_ds, bs, sampler=RandomSampler(train_ds), collate_fn=collate)
valid_dl = DataLoader(valid_ds, bs, sampler=SequentialSampler(valid_ds), collate_fn=collate)

# or, more simply
train_dl = DataLoader(train_ds, bs, shuffle=True, drop_last=True)
valid_dl = DataLoader(valid_ds, bs, shuffle=False)
```
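To close the loop, a minimal sketch of training with PyTorch's loaders (my own wiring, assuming the `get_model`, `get_layers`, `loss_func`, and `accuracy` defined above, and that `valid_ds` exists):

```python3
model, opt = get_model(get_layers, lr=0.5)

for epoch in range(epochs):
    model.train()
    for xb, yb in train_dl:
        loss = loss_func(model(xb), yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

    model.eval()
    with torch.no_grad():
        # average the validation accuracy over the batches
        accs = [accuracy(model(xb), yb) for xb, yb in valid_dl]
        print(epoch, torch.stack(accs).mean())
```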