diff --git a/model.py b/model.py
index 0f481656..4ed1208c 100644
--- a/model.py
+++ b/model.py
@@ -14,6 +14,40 @@
 }
 supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items())
 
+class ClippedReLU(nn.Module):
+    r"""Applies a ReLU capped at ``max_val`` element-wise
+    :math:`{ClippedReLU}(x) = min(max(0, x), max\_val)`
+
+    Args:
+        max_val: upper bound of the output range. Default: ``20.0``
+        inplace: zero the non-positive inputs in-place. The upper clamp
+            always returns a new tensor. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    Examples::
+
+        >>> m = ClippedReLU(max_val=20.0)
+        >>> input = autograd.Variable(torch.randn(2))
+        >>> print(m(input))
+    """
+
+    def __init__(self, max_val=20.0, inplace=False):
+        super(ClippedReLU, self).__init__()
+        self.max_val = max_val
+        self.inplace = inplace
+
+    def forward(self, input):
+        # threshold() zeroes non-positive values (in-place if requested);
+        # clamp() then applies the upper bound out-of-place.
+        return F.threshold(input, 0, 0, self.inplace).clamp(min=0.0, max=self.max_val)
+
+    def __repr__(self):
+        inplace_str = ', inplace' if self.inplace else ''
+        return '{}(max_val={}{})'.format(self.__class__.__name__, self.max_val, inplace_str)
 
 class SequenceWise(nn.Module):
     def __init__(self, module):
@@ -65,7 +99,7 @@ def flatten_parameters(self):
     def forward(self, x):
         if self.batch_norm is not None:
             x = self.batch_norm(x)
-        x, _ = self.rnn(x)
+        x, _ = self.rnn(x, hx=Variable(torch.zeros(2,1,600)))  # NOTE(review): hard-coded (2, 1, 600) initial state presumably pins directions*layers=2, batch=1, hidden=600 and lives on CPU; an LSTM would expect an (h_0, c_0) tuple -- confirm against the rnn configuration
         if self.bidirectional:
             x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1)  # (TxNxH*2) -> (TxNxH) by sum
         return x
@@ -134,10 +168,12 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layer
         self.conv = nn.Sequential(
             nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(0, 10)),
             nn.BatchNorm2d(32),
-            nn.Hardtanh(0, 20, inplace=True),
+            ClippedReLU(),
+            #nn.Hardtanh(0, 20, inplace=True),
             nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), ),
             nn.BatchNorm2d(32),
-            nn.Hardtanh(0, 20, inplace=True)
+            #nn.Hardtanh(0, 20, inplace=True),
+            ClippedReLU()
         )
         # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
         rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
@@ -157,7 +193,8 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layer
         self.lookahead = nn.Sequential(
             # consider adding batch norm?
             Lookahead(rnn_hidden_size, context=context),
-            nn.Hardtanh(0, 20, inplace=True)
+            #nn.Hardtanh(0, 20, inplace=True)
+            ClippedReLU()
         ) if not bidirectional else None
 
         fully_connected = nn.Sequential(
diff --git a/train.py b/train.py
index 020ef0d6..ff01a968 100644
--- a/train.py
+++ b/train.py
@@ -242,7 +242,6 @@ def update(self, val, n=1):
             sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False)
 
             loss = criterion(out, targets, sizes, target_sizes)
-            loss = loss / inputs.size(0)  # average the loss by minibatch
 
             loss_sum = loss.data.sum()
             inf = float("inf")