Skip to main content

Clashluke's group workspace

Timestamps visible
2022-12-06 14:05:59
add_depth: false
2022-12-06 14:05:59
data:
2022-12-06 14:05:59
    datasets_used_per_step: 64
2022-12-06 14:05:59
    deterministic: true
2022-12-06 14:05:59
    interleaved_datasets: 64
2022-12-06 14:05:59
    parallel_workers: 64
2022-12-06 14:05:59
    path: gs://homebrewnlp-eu/the-char-pile/*
2022-12-06 14:05:59
    prefetch_buffer: 1
2022-12-06 14:05:59
    seed: 0
2022-12-06 14:05:59
    shuffle_buffer_gb: 0
2022-12-06 14:05:59
depth: 0
2022-12-06 14:05:59
dims:
2022-12-06 14:05:59
    batch: 256
2022-12-06 14:05:59
    depth: 16
2022-12-06 14:05:59
    features: 256
2022-12-06 14:05:59
    inner_bottleneck_features: 128
2022-12-06 14:05:59
    inner_bottleneck_kernel: 1
2022-12-06 14:05:59
    outer_bottleneck_kernel: 8
2022-12-06 14:05:59
    pointwise_features: 512
2022-12-06 14:05:59
    pointwise_kernel: 2
2022-12-06 14:05:59
    sequence: 4096
2022-12-06 14:05:59
    spatial_mixing_kernel: 512
2022-12-06 14:05:59
    vocab: 256
2022-12-06 14:05:59
fail_on_missing_parameter: true
2022-12-06 14:05:59
global_prefix: ''
2022-12-06 14:05:59
model:
2022-12-06 14:05:59
    autoregressive: true
2022-12-06 14:05:59
    computation_dtype: bfloat16
2022-12-06 14:05:59
    conv_scale: 8
2022-12-06 14:05:59
    conv_shift: 80
2022-12-06 14:05:59
    norm:
2022-12-06 14:05:59
        eps: 1.0e-16
2022-12-06 14:05:59
        power: 2
2022-12-06 14:05:59
        zero_mean: false
2022-12-06 14:05:59
    storage_dtype: float32
2022-12-06 14:05:59
name_cache_offsets: {}
2022-12-06 14:05:59
optimizer:
2022-12-06 14:05:59
    adam_beta1: 0.1
2022-12-06 14:05:59
    adam_beta2: 0.01
2022-12-06 14:05:59
    epsilon: 1.0e-16
2022-12-06 14:05:59
    exponential_decay: 7.0e-06
2022-12-06 14:05:59
    gradient_clip: 0.001
2022-12-06 14:05:59
    heavyball: false
2022-12-06 14:05:59
    learning_rate: 0.01
2022-12-06 14:05:59
    nesterov: false
2022-12-06 14:05:59
    shampoo:
2022-12-06 14:05:59
        beta1: 0.1
2022-12-06 14:05:59
        beta2: 0.01
2022-12-06 14:05:59
        block_size: 512
2022-12-06 14:05:59
        flatten_conv: true
2022-12-06 14:05:59
        flatten_depth: true
2022-12-06 14:05:59
        statistics_compute_steps: 1
2022-12-06 14:05:59
    warmup_end: 1024
2022-12-06 14:05:59
    weight_decay: 0
2022-12-06 14:05:59
parameter_usages: {}
2022-12-06 14:05:59
seed: 0
2022-12-06 14:05:59
training:
2022-12-06 14:05:59
    checkpoint_interval: 256
2022-12-06 14:05:59
    checkpoint_load_path: ''
2022-12-06 14:05:59
    checkpoint_path: gs://homebrewnlp-eu/homebrewnlp-checkpoint-no-decay
2022-12-06 14:05:59
    debug: false
2022-12-06 14:05:59
    device_steps: 1
2022-12-06 14:05:59
    device_unroll: 1
2022-12-06 14:05:59
    do_checkpoint: true
2022-12-06 14:05:59
    steps: 65536
2022-12-06 14:05:59
    trace:
2022-12-06 14:05:59
        do_trace: true
2022-12-06 14:05:59
        output_path: trace
2022-12-06 14:05:59
        start_step: 16
2022-12-06 14:05:59
        stop_step: 80
2022-12-06 14:05:59
    z_loss: 0
2022-12-06 14:05:59
wandb:
2022-12-06 14:05:59
    entity: homebrewnlp
2022-12-06 14:05:59
    group: no-decay
2022-12-06 14:05:59
    id: 4vvyidngdkiakzfg2m37muv7a5o183ip
2022-12-06 14:05:59
    median_sizes:
2022-12-06 14:05:59
    - 64
2022-12-06 14:05:59
    - 256
2022-12-06 14:05:59
    - 1024
2022-12-06 14:05:59
    name: no-decay-0
2022-12-06 14:05:59
    project: gpt
2022-12-06 14:05:59
Initializing dataset..                              Took:     0.68s
2022-12-06 14:06:45
Enqueueing first batch..                            Took:    46.43s
2022-12-06 14:05:59
/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py:2178: UserWarning: The `deterministic` argument has no effect unless the `num_parallel_calls` argument is specified.
2022-12-06 14:05:59
  warnings.warn("The `deterministic` argument has no effect unless the "
2022-12-06 14:05:59
/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py:1712: UserWarning: The `deterministic` argument has no effect unless the `num_parallel_calls` argument is specified.
2022-12-06 14:05:59
  warnings.warn("The `deterministic` argument has no effect unless the "
2022-12-06 14:05:59
/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py:2014: UserWarning: The `deterministic` argument has no effect unless the `num_parallel_calls` argument is specified.
2022-12-06 14:05:59
  warnings.warn("The `deterministic` argument has no effect unless the "
2022-12-06 14:07:09
Acquiring forward parameters..                      Took:    23.82s
2022-12-06 14:12:49
Acquiring optimizer parameters..                    Took:   339.08s
2022-12-06 14:12:49
Parameters: 1,620,705,280
2022-12-06 14:12:49
Buffers:    11,315,445,760