diff --git a/codegeex/megatron/arguments.py b/codegeex/megatron/arguments.py
index e63b7be..732f877 100644
--- a/codegeex/megatron/arguments.py
+++ b/codegeex/megatron/arguments.py
@@ -415,6 +415,10 @@ def _add_network_size_args(parser):
         help="Disable BERT binary head.",
         dest="bert_binary_head",
     )
+    group.add_argument(
+        "--compress",
+        action="store_true",
+    )
 
     return parser
 
diff --git a/codegeex/megatron/model/transformer.py b/codegeex/megatron/model/transformer.py
index 4dd761e..733ca84 100644
--- a/codegeex/megatron/model/transformer.py
+++ b/codegeex/megatron/model/transformer.py
@@ -61,14 +61,19 @@ class ParallelMLP(MegatronModule):
     applied.
     """
 
-    def __init__(self, init_method, output_layer_init_method):
+    def __init__(
+        self,
+        init_method,
+        output_layer_init_method,
+        scale: int = 4,
+    ):
         super(ParallelMLP, self).__init__()
         args = get_args()
 
         # Project to 4h.
         self.dense_h_to_4h = mpu.ColumnParallelLinear(
             args.hidden_size,
-            4 * args.hidden_size,
+            scale * args.hidden_size,
             gather_output=False,
             init_method=init_method,
             # skip_bias_add=True,
@@ -78,7 +83,7 @@ class ParallelMLP(MegatronModule):
 
         # Project back to h.
         self.dense_4h_to_h = mpu.RowParallelLinear(
-            4 * args.hidden_size,
+            scale * args.hidden_size,
             args.hidden_size,
             input_is_parallel=True if args.tensor_model_parallel_size > 1 else False,
             init_method=output_layer_init_method,
@@ -264,7 +269,7 @@ class ParallelSelfAttention(MegatronModule):
         if self.attention_softmax_in_fp32:
             attention_probs = self.softmax(attention_scores.float()).half()
         else:
-            attention_probs = self.softmax(attention_scores)
+            attention_probs = self.softmax(attention_scores.half())
 
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
@@ -485,7 +490,7 @@ class ParallelTopQuerySelfAttention(MegatronModule):
         if self.attention_softmax_in_fp32:
             attention_probs = self.softmax(attention_scores.float()).half()
         else:
-            attention_probs = self.softmax(attention_scores)
+            attention_probs = self.softmax(attention_scores.half())
 
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
@@ -607,7 +612,8 @@ class ParallelTransformerLayer(MegatronModule):
             self.ln_fp16 = False
         # MLP
         self.mlp = ParallelMLP(init_method,
-                               output_layer_init_method)
+                               output_layer_init_method,
+                               scale=2 if args.compress else 4)
 
     def forward(
         self,
diff --git a/codegeex/megatron/training.py b/codegeex/megatron/training.py
index 2c0ffc4..b98f265 100644
--- a/codegeex/megatron/training.py
+++ b/codegeex/megatron/training.py
@@ -65,12 +65,6 @@ except ImportError:
 from filelock import FileLock
 import pathlib
 
-try:
-    import bmcook
-    from bmcook import Config
-except ImportError:
-    print("bmcook not imported.")
-    bmcook = None
 
 
 def print_datetime(string):
     print_rank_0("[" + string + "] datetime: {} ".format(time_str))
 
 
-def compress_setup(args, model, optimizer):
-    teacher = get_model(args)
-    cook_config = ConfigParser(args.cook_config)
-    CPMAntTrainer.set_compression(cook_config, model, optimizer, teacher=teacher, remove_ckptblock=False, target_linear=Linear)
 
 def pretrain(
     train_valid_test_dataset_provider,
     model_provider,
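
For context only (not part of the patch), the sketch below illustrates what the new `scale` parameter and `--compress` flag change: with `--compress`, each transformer layer's MLP projects to 2*h instead of 4*h, roughly halving the MLP parameter count. Plain `torch.nn.Linear` stands in for Megatron's `ColumnParallelLinear`/`RowParallelLinear`, and the names `SimpleMLP`, `param_count`, and the hidden size are illustrative, not taken from the repository.

# Minimal sketch of the scale change; nn.Linear replaces the tensor-parallel layers.
import torch
import torch.nn as nn
import torch.nn.functional as F


class SimpleMLP(nn.Module):
    def __init__(self, hidden_size: int, scale: int = 4):
        super().__init__()
        # Project to scale * h (the patch replaces the hard-coded 4 with `scale`).
        self.dense_h_to_4h = nn.Linear(hidden_size, scale * hidden_size)
        # Project back to h.
        self.dense_4h_to_h = nn.Linear(scale * hidden_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.dense_4h_to_h(F.gelu(self.dense_h_to_4h(x)))


def param_count(module: nn.Module) -> int:
    return sum(p.numel() for p in module.parameters())


hidden = 1024                            # illustrative hidden size
full = SimpleMLP(hidden, scale=4)        # default width (no --compress)
compressed = SimpleMLP(hidden, scale=2)  # width used when --compress is set
print(param_count(full), param_count(compressed))  # compressed MLP has roughly half the parameters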