diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2e46d55..3e89e8b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,8 +48,7 @@ if (GGML_SANITIZE_UNDEFINED)
 endif()
 
 #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
 
 # dependencies
 
diff --git a/examples/common.cpp b/examples/common.cpp
index fc45999..4f5dead 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -35,6 +35,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.n_batch = std::stoi(argv[++i]);
         } else if (arg == "-m" || arg == "--model") {
             params.model = argv[++i];
+        } else if (arg == "-q" || arg == "--quiet") {
+            params.quiet=1;
+        } else if (arg == "-Q" || arg == "--silent") {
+            params.silent=1; params.quiet=1;
+        } else if (arg == "-r" || arg == "--regex") {
+            params.regex = argv[++i];
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, params);
             exit(0);
@@ -64,6 +70,12 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -q ,  --quiet\n");
+    fprintf(stderr, "                        output only prompt and predictions\n");
+    fprintf(stderr, "  -Q ,  --silent\n");
+    fprintf(stderr, "                        output only predictions\n");
+    fprintf(stderr, "  -r ,  --regex REGEX\n");
+    fprintf(stderr, "                        when output matches regex prediction will terminate\n");
     fprintf(stderr, "\n");
 }
 
diff --git a/examples/common.h b/examples/common.h
index b08e576..65ca187 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -27,7 +27,10 @@ struct gpt_params {
     int32_t n_batch = 8; // batch size for prompt processing
 
     std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
+    bool silent = 0;
+    bool quiet = 0;
     std::string prompt;
+    std::string regex;
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp
index 99ed440..2c9d36e 100644
--- a/examples/gpt-j/main.cpp
+++ b/examples/gpt-j/main.cpp
@@ -13,7 +13,9 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <regex.h>
 
+#define OUTMAX (2048*10)
 // default hparams (GPT-J 6B)
 struct gptj_hparams {
     int32_t n_vocab = 50400;
@@ -25,6 +27,8 @@ struct gptj_hparams {
     int32_t ftype   = 1;
 };
 
+bool quiet=0;
+bool silent=0;
 struct gptj_layer {
     // normalization
     struct ggml_tensor * ln_1_g;
@@ -70,7 +74,7 @@ struct gptj_model {
 
 // load the model's weights from a file
 bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    if(!quiet) printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
     if (!fin) {
@@ -100,13 +104,13 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
         fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        if(!quiet)printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        if(!quiet)printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+        if(!quiet)printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
+        if(!quiet)printf("%s: n_head  = %d\n", __func__, hparams.n_head);
+        if(!quiet)printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+        if(!quiet)printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
+        if(!quiet)printf("%s: ftype   = %d\n", __func__, hparams.ftype);
     }
 
     // load vocab
@@ -182,7 +186,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
 
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        if(!quiet)printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
     // create the ggml context
@@ -280,7 +284,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
 
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 
-        printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+        if(!quiet)printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
     }
 
     // load weights
@@ -288,7 +292,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         int n_tensors = 0;
         size_t total_size = 0;
 
-        printf("%s: ", __func__);
+        if(!quiet)printf("%s: ", __func__);
 
         while (true) {
             int32_t n_dims;
@@ -332,7 +336,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
 
             // for debugging
             if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+                if(!quiet)printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
             }
 
             const size_t bpe = ggml_type_size(ggml_type(ttype));
@@ -348,14 +352,14 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
             //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
             total_size += ggml_nbytes(tensor);
             if (++n_tensors % 8 == 0) {
-                printf(".");
+                if(!quiet)printf(".");
                 fflush(stdout);
             }
         }
 
-        printf(" done\n");
+        if(!quiet)printf(" done\n");
 
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+        if(!quiet)printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
     }
 
     fin.close();
@@ -601,8 +605,10 @@ bool gptj_eval(
 int main(int argc, char ** argv) {
     const int64_t t_main_start_us = ggml_time_us();
-
+    char out[OUTMAX+1];
+    regex_t regex;
     gpt_params params;
+    memset(out,0,OUTMAX);
 
     params.model = "models/gpt-j-6B/ggml-model.bin";
 
     if (gpt_params_parse(argc, argv, params) == false) {
@@ -613,7 +619,19 @@ int main(int argc, char ** argv) {
         params.seed = time(NULL);
     }
 
-    printf("%s: seed = %d\n", __func__, params.seed);
+    quiet=params.quiet;
+    silent=params.silent;
+
+    if(!(params.regex.empty())) {
+        if(regcomp(&regex,params.regex.c_str(),0)){
+            fprintf(stderr,"Bad regex '%s'",params.regex.c_str());
+            exit(1);
+        } else {
+            if(!quiet)printf("regex = '%s'\n",params.regex.c_str());
+        }
+    }
+
+    if(!quiet)printf("%s: seed = %d\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.prompt.empty()) {
@@ -656,8 +674,8 @@ int main(int argc, char ** argv) {
 
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    printf("\n");
+    if(!quiet)printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    if(!quiet)printf("\n");
 
     std::vector<gpt_vocab::id> embd;
 
@@ -701,10 +719,17 @@ int main(int argc, char ** argv) {
 
             // add it to the context
             embd.push_back(id);
+            //concat to out str
+            strncat(out,vocab.id_to_token[id].c_str(),OUTMAX);
+            //display text
+            printf("%s", vocab.id_to_token[id].c_str());
         } else {
             // if here, it means we are still processing the input prompt
             for (int k = i; k < embd_inp.size(); k++) {
                 embd.push_back(embd_inp[k]);
+                //display processed prompt text if not in silent mode
+                if(!silent) printf("%s", vocab.id_to_token[embd_inp[k]].c_str());
+                fflush(stdout);
                 if (embd.size() > params.n_batch) {
                     break;
                 }
@@ -712,12 +737,13 @@ int main(int argc, char ** argv) {
             i += embd.size() - 1;
         }
 
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
+        if(!params.regex.empty()) {
+            if(!regexec(&regex,out,0,NULL,0)){
+                if(!quiet)printf("\nMatched reverse prompt.\n");
+                break;
+            }
         }
         fflush(stdout);
-
         // end of text token
         if (embd.back() == 50256) {
             break;
@@ -728,12 +754,12 @@ int main(int argc, char ** argv) {
     {
        const int64_t t_main_end_us = ggml_time_us();
 
-        printf("\n\n");
-        printf("%s:     mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:         load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:       sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:      predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:        total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        if(!quiet)printf("\n\n");
+        if(!quiet)printf("%s:     mem per token = %8zu bytes\n", __func__, mem_per_token);
+        if(!quiet)printf("%s:         load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        if(!quiet)printf("%s:       sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        if(!quiet)printf("%s:      predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        if(!quiet)printf("%s:        total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
 
     ggml_free(model.ctx);
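
A note on the regex-based stop condition introduced in examples/gpt-j/main.cpp: it relies on POSIX <regex.h> (regcomp/regexec), matching the pattern against the buffer of output accumulated so far after each generated token. The sketch below is a minimal standalone illustration of that pattern, not code from the patch; the pattern string, token list, and the tighter strncat bound are illustrative assumptions. One detail worth noting: the patch passes OUTMAX as strncat's limit, which bounds each individual append rather than the remaining space in the buffer; bounding by OUTMAX - strlen(out), as in the sketch, is the conventional way to keep the total within the buffer.

    /* minimal sketch of the accumulate-then-regexec stop condition (POSIX systems) */
    #include <regex.h>
    #include <stdio.h>
    #include <string.h>

    #define OUTMAX (2048*10)

    int main(void) {
        char out[OUTMAX + 1] = "";   /* accumulated output, zero-initialized */
        regex_t re;

        /* compile once; regcomp returns 0 on success */
        if (regcomp(&re, "User:", 0)) {
            fprintf(stderr, "Bad regex\n");
            return 1;
        }

        /* stand-in for the tokens produced by the model */
        const char *tokens[] = { "Hello", " world", "\n", "User:" };
        for (size_t i = 0; i < sizeof(tokens)/sizeof(tokens[0]); i++) {
            /* bound the append by the space actually left in the buffer */
            strncat(out, tokens[i], OUTMAX - strlen(out));

            /* regexec returns 0 when the pattern matches anywhere in out */
            if (!regexec(&re, out, 0, NULL, 0)) {
                printf("\nMatched reverse prompt.\n");
                break;
            }
        }

        regfree(&re);
        return 0;
    }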