Ranuga-d's workspace: Runs (24 total, 6 visualized)

Every run was launched by user ranuga-d with criterion CrossEntropyLoss() and learning rate 1e-05. The Notes and Sweep fields were empty ("-") for every run, and no Tags or Created values appear in this export, so those columns are dropped below. Unless noted otherwise the model is the 12-layer, 768-dimensional encoder printed after the table; run 2 is the only run on the 24-layer, 1024-dimensional variant.

| Run | State    | Runtime | Optimizer | Test acc | Test F1 | Test loss | Test prec | Test rec | Train acc | Train F1 | Train loss | Train prec | Train rec |
|----:|----------|--------:|-----------|---------:|--------:|----------:|----------:|---------:|----------:|---------:|-----------:|-----------:|----------:|
|   1 | Crashed  | 27s     | Adam      | -        | -       | -         | -         | -        | -         | -        | -          | -          | -         |
|   2 | Finished | 1m 57s  | Adam      | -        | -       | -         | -         | -        | -         | -        | -          | -          | -         |
|   3 | Crashed  | 51m 24s | Adam      | 0.76293  | 0.76164 | 45.63215  | 0.77183   | 0.76293  | 0.98521   | 0.98518  | 41.89585   | 0.98606    | 0.98521   |
|   4 | Killed   | 23s     | Adam      | -        | -       | -         | -         | -        | -         | -        | -          | -          | -         |
|   5 | Killed   | 2m 16s  | Adam      | 0.73491  | 0.73403 | 45.89676  | 0.75706   | 0.73491  | 0.77008   | 0.76871  | 44.92231   | 0.79473    | 0.77008   |
|   6 | Finished | 10m 10s | Adam      | 0.8035   | 0.80347 | 44.52665  | 0.81027   | 0.8035   | 0.85399   | 0.85417  | 44.07314   | 0.86183    | 0.85399   |
|   7 | Finished | 9m 44s  | Adam      | 0.81568  | 0.81785 | 44.44224  | 0.83202   | 0.81568  | 0.85205   | 0.85433  | 44.20432   | 0.87021    | 0.85205   |
|   8 | Finished | 9m 40s  | Adam      | 0.80138  | 0.80134 | 44.54241  | 0.80848   | 0.80138  | 0.86035   | 0.86067  | 43.98096   | 0.86844    | 0.86035   |
|   9 | Finished | 9m 46s  | Adam      | 0.81621  | 0.8182  | 44.43667  | 0.83086   | 0.81621  | 0.86123   | 0.86326  | 44.06345   | 0.87755    | 0.86123   |
|  10 | Finished | 9m 32s  | Adam      | 0.81727  | 0.82157 | 44.46481  | 0.84475   | 0.81727  | 0.85646   | 0.85957  | 44.17048   | 0.87983    | 0.85646   |
|  11 | Finished | 10m 4s  | Adam      | 0.8035   | 0.80295 | 44.48185  | 0.81005   | 0.8035   | 0.85169   | 0.85155  | 44.06789   | 0.86025    | 0.85169   |
|  12 | Finished | 9m 34s  | Adam      | 0.82362  | 0.82663 | 44.35217  | 0.8439    | 0.82362  | 0.86388   | 0.86649  | 44.05035   | 0.88353    | 0.86388   |
|  13 | Finished | 9m 35s  | Adam      | 0.81674  | 0.81687 | 44.32333  | 0.82524   | 0.81674  | 0.86458   | 0.8648   | 43.90794   | 0.87312    | 0.86458   |
|  14 | Finished | 9m 12s  | Adam      | 0.81621  | 0.81702 | 44.37612  | 0.82512   | 0.81621  | 0.86511   | 0.86588  | 43.9434    | 0.87547    | 0.86511   |
|  15 | Finished | 9m 4s   | Adam      | 0.79608  | 0.79611 | 44.53491  | 0.80949   | 0.79608  | 0.85452   | 0.85414  | 43.99803   | 0.86285    | 0.85452   |
|  16 | Finished | 9m 38s  | Adam      | 0.8035   | 0.80275 | 44.40342  | 0.81258   | 0.8035   | 0.85187   | 0.85125  | 44.01117   | 0.86037    | 0.85187   |
|  17 | Finished | 8m 34s  | SGD       | 0.57256  | 0.72413 | 47.40422  | 1         | 0.57256  | 0.56974   | 0.72135  | 47.7175    | 1          | 0.56974   |
|  18 | Finished | 8m 50s  | ASGD      | 0.57945  | 0.70528 | 47.51187  | 0.95095   | 0.57945  | 0.57609   | 0.70936  | 47.76311   | 0.96786    | 0.57609   |
|  19 | Finished | 10m 4s  | Adamax    | 0.79502  | 0.79493 | 44.57689  | 0.80622   | 0.79502  | 0.80809   | 0.80738  | 44.71496   | 0.81738    | 0.80809   |
|  20 | Finished | 10m 15s | AdamW     | 0.81727  | 0.81813 | 44.36416  | 0.82776   | 0.81727  | 0.86229   | 0.86333  | 44.00328   | 0.87359    | 0.86229   |
|  21 | Finished | 9m 53s  | Adam      | 0.81674  | 0.81687 | 44.32333  | 0.82524   | 0.81674  | 0.86458   | 0.8648   | 43.90794   | 0.87312    | 0.86458   |
|  22 | Finished | 9m 54s  | Adagrad   | 0.77013  | 0.76966 | 45.01288  | 0.77882   | 0.77013  | 0.76589   | 0.76552  | 45.40194   | 0.77748    | 0.76589   |
|  23 | Finished | 10m 2s  | Adadelta  | 0.57256  | 0.72413 | 47.40422  | 1         | 0.57256  | 0.56974   | 0.72135  | 47.7175    | 1          | 0.56974   |
|  24 | Finished | 9m 58s  | AdamW     | 0.79237  | 0.79197 | 44.54686  | 0.80645   | 0.79237  | 0.84145   | 0.84062  | 44.14776   | 0.85103    | 0.84145   |
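This view reads like a Weights & Biases runs table. If so, the same columns can be pulled with the wandb public API. A minimal sketch, assuming a hypothetical entity/project path and that the summary and config keys match the column names shown above:

```python
import wandb

api = wandb.Api()
# Hypothetical path; substitute the real entity/project of this workspace.
for run in api.runs("ranuga-d/text-classification"):
    # run.state is "finished", "crashed", or "killed"; run.summary holds the
    # last-logged metrics and run.config the hyperparameters.
    print(run.state,
          run.config.get("learning_rate"),
          run.summary.get("Test accuracy"),
          run.summary.get("Train accuracy"))
```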
model

All runs except run 2 share this 12-layer, 768-dimensional RoBERTa-style encoder (vocabulary 250002, 514 positions) with a two-class classification head:
TL(
  (classifier_head): RobertaClassificationHead(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (out_proj): Linear(in_features=768, out_features=2, bias=True)
    (activation_fn): ReLU()
  )
  (model): RobertaModel(
    (encoder): RobertaEncoder(
      (transformer): TransformerEncoder(
        (token_embedding): Embedding(250002, 768, padding_idx=1)
        (layers): TransformerEncoder(
          (layers): ModuleList(
            (0-11): 12 x TransformerEncoderLayer(
              (self_attn): MultiheadAttention(
                (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
              )
              (linear1): Linear(in_features=768, out_features=3072, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (linear2): Linear(in_features=3072, out_features=768, bias=True)
              (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (dropout1): Dropout(p=0.1, inplace=False)
              (dropout2): Dropout(p=0.1, inplace=False)
            )
          )
        )
        (positional_embedding): PositionalEmbedding(
          (embedding): Embedding(514, 768, padding_idx=1)
        )
        (embedding_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (head): RobertaClassificationHead(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (out_proj): Linear(in_features=768, out_features=2, bias=True)
      (activation_fn): ReLU()
    )
  )
)
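The repr fixes the layer shapes but not everything: the number of attention heads, the position-id offset, and the forward order inside RobertaClassificationHead are not printed, and the identical head is registered twice (as classifier_head and as model.head). A minimal shape-compatible sketch in plain PyTorch, assuming 12 attention heads and standard nn.TransformerEncoder blocks; TL, RobertaModel, and PositionalEmbedding are the project's own wrappers, so this is a stand-in, not the original code:

```python
import torch
import torch.nn as nn

class ClassificationHead(nn.Module):
    """Mirrors the printed RobertaClassificationHead modules; the call order
    in forward() is not shown in the repr, so this ordering is an assumption."""
    def __init__(self, dim: int = 768, num_classes: int = 2, p: float = 0.1):
        super().__init__()
        self.dense = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(p)
        self.out_proj = nn.Linear(dim, num_classes)
        self.activation_fn = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.activation_fn(self.dense(self.dropout(x)))
        return self.out_proj(self.dropout(x))

class Encoder(nn.Module):
    """Stand-in for the printed encoder: token and positional embeddings,
    embedding LayerNorm and dropout, then the transformer layer stack."""
    def __init__(self, vocab: int = 250002, dim: int = 768, depth: int = 12,
                 heads: int = 12, ffn: int = 3072, max_pos: int = 514, pad: int = 1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab, dim, padding_idx=pad)
        self.positional_embedding = nn.Embedding(max_pos, dim, padding_idx=pad)
        self.embedding_layer_norm = nn.LayerNorm(dim, eps=1e-5)
        self.dropout = nn.Dropout(0.1)
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=heads,
                                           dim_feedforward=ffn, dropout=0.1,
                                           batch_first=True)
        self.layers = nn.TransformerEncoder(layer, num_layers=depth)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # How the original offsets position ids is not visible in the repr.
        pos = torch.arange(tokens.size(1), device=tokens.device).unsqueeze(0)
        x = self.token_embedding(tokens) + self.positional_embedding(pos)
        x = self.dropout(self.embedding_layer_norm(x))
        return self.layers(x)

# Classify from the first token's final hidden state (a common convention, assumed here):
encoder, head = Encoder(), ClassificationHead()
tokens = torch.randint(0, 250002, (2, 16))
logits = head(encoder(tokens)[:, 0])  # shape (2, 2)
```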
Run 2 (Finished, 1m 57s) was the only run on the larger 24-layer, 1024-dimensional variant of the same architecture:
TL(
  (classifier_head): RobertaClassificationHead(
    (dense): Linear(in_features=1024, out_features=1024, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (out_proj): Linear(in_features=1024, out_features=2, bias=True)
    (activation_fn): ReLU()
  )
  (model): RobertaModel(
    (encoder): RobertaEncoder(
      (transformer): TransformerEncoder(
        (token_embedding): Embedding(250002, 1024, padding_idx=1)
        (layers): TransformerEncoder(
          (layers): ModuleList(
            (0-23): 24 x TransformerEncoderLayer(
              (self_attn): MultiheadAttention(
                (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
              )
              (linear1): Linear(in_features=1024, out_features=4096, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (linear2): Linear(in_features=4096, out_features=1024, bias=True)
              (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (dropout1): Dropout(p=0.1, inplace=False)
              (dropout2): Dropout(p=0.1, inplace=False)
            )
          )
        )
        (positional_embedding): PositionalEmbedding(
          (embedding): Embedding(514, 1024, padding_idx=1)
        )
        (embedding_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (head): RobertaClassificationHead(
      (dense): Linear(in_features=1024, out_features=1024, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (out_proj): Linear(in_features=1024, out_features=2, bias=True)
      (activation_fn): ReLU()
    )
  )
)
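To make the size difference concrete, parameters can be counted directly from the modules. The head counts below (12 and 16) are assumptions, but they do not affect the totals, since MultiheadAttention's projection shapes depend only on d_model:

```python
import torch.nn as nn

def count_parameters(module: nn.Module) -> int:
    """Total trainable parameters in a module."""
    return sum(p.numel() for p in module.parameters() if p.requires_grad)

# One encoder layer at each printed width:
base = nn.TransformerEncoderLayer(d_model=768, nhead=12, dim_feedforward=3072)
large = nn.TransformerEncoderLayer(d_model=1024, nhead=16, dim_feedforward=4096)
print(count_parameters(base))   # ~7.1M per layer, times 12 layers
print(count_parameters(large))  # ~12.6M per layer, times 24 layers

# The token embedding alone is 250002 x 768 ≈ 192M (base) or 250002 x 1024 ≈ 256M (large).
```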
optimizer

Seven distinct optimizer configurations appear across the 24 runs; each is printed once below, and a sketch of instantiating them follows the list. Adam (runs 1-16 and 21, all PyTorch defaults at lr 1e-05):

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 1e-05
    maximize: False
    weight_decay: 0
)
SGD (run 17):
SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 1e-05
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)
ASGD (run 18):
ASGD (
Parameter Group 0
    alpha: 0.75
    differentiable: False
    foreach: None
    lambd: 0.0001
    lr: 1e-05
    maximize: False
    t0: 1000000.0
    weight_decay: 0
)
Adamax (run 19):
Adamax (
Parameter Group 0
    betas: (0.9, 0.999)
    differentiable: False
    eps: 1e-08
    foreach: None
    lr: 1e-05
    maximize: False
    weight_decay: 0
)
AdamW (runs 20 and 24; the only configuration with non-zero weight decay):
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 1e-05
    maximize: False
    weight_decay: 0.01
)
Adagrad (run 22):
Adagrad (
Parameter Group 0
    differentiable: False
    eps: 1e-10
    foreach: None
    initial_accumulator_value: 0
    lr: 1e-05
    lr_decay: 0
    maximize: False
    weight_decay: 0
)
Adadelta (run 23):
Adadelta (
Parameter Group 0
    differentiable: False
    eps: 1e-06
    foreach: None
    lr: 1e-05
    maximize: False
    rho: 0.9
    weight_decay: 0
)
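A minimal sketch of building these configurations with torch.optim: every non-default value in the printed parameter groups is passed explicitly, and everything else matches the PyTorch defaults shown above. The nn.Linear stand-in just makes the sketch self-contained; in the runs the parameters come from the TL model:

```python
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(768, 2)  # stand-in for the TL module printed above
lr = 1e-5

optimizers = {
    "Adam":     optim.Adam(model.parameters(), lr=lr),                                  # runs 1-16, 21
    "SGD":      optim.SGD(model.parameters(), lr=lr),                                   # run 17 (momentum=0)
    "ASGD":     optim.ASGD(model.parameters(), lr=lr, lambd=1e-4, alpha=0.75, t0=1e6),  # run 18
    "Adamax":   optim.Adamax(model.parameters(), lr=lr),                                # run 19
    "AdamW":    optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01),              # runs 20, 24
    "Adagrad":  optim.Adagrad(model.parameters(), lr=lr),                               # run 22
    "Adadelta": optim.Adadelta(model.parameters(), lr=lr, rho=0.9, eps=1e-6),           # run 23
}

# One training step, identical for every configuration (only one optimizer
# would be constructed in a real run; all seven here is just for illustration):
criterion = nn.CrossEntropyLoss()
x, y = torch.randn(4, 768), torch.tensor([0, 1, 0, 1])
opt = optimizers["Adam"]
opt.zero_grad()
loss = criterion(model(x), y)
loss.backward()
opt.step()
```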
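On the metric columns: in every row of the table, test recall equals test accuracy and train recall equals train accuracy, which is exactly what support-weighted recall guarantees, so the columns are consistent with weighted averaging. A sketch of producing all five numbers with scikit-learn under that assumption; the averaging mode actually used by the runs is not recorded in this view:

```python
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def classification_metrics(y_true, y_pred):
    # With average="weighted", recall coincides with accuracy by construction,
    # matching every row of the table above.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1-score": f1,
        "precision": precision,
        "recall": recall,
    }

print(classification_metrics([0, 1, 1, 0, 1], [0, 1, 0, 0, 1]))
```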