diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2e46d55..3e89e8b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,8 +48,7 @@ if (GGML_SANITIZE_UNDEFINED)
 endif()
 
 #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
 
 # dependencies
 
diff --git a/examples/common.cpp b/examples/common.cpp
index fc45999..4f5dead 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -35,6 +35,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.n_batch = std::stoi(argv[++i]);
         } else if (arg == "-m" || arg == "--model") {
             params.model = argv[++i];
+        } else if (arg == "-q" || arg == "--quiet") {
+            params.quiet=1;
+        } else if (arg == "-Q" || arg == "--silent") {
+            params.silent=1; params.quiet=1;
+        } else if (arg == "-r" || arg == "--regex") {
+            params.regex = argv[++i];
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, params);
             exit(0);
@@ -64,6 +70,12 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -q ,  --quiet\n");
+    fprintf(stderr, "                        output only prompt and predictions\n");
+    fprintf(stderr, "  -Q ,  --silent\n");
+    fprintf(stderr, "                        output only predictions\n");
+    fprintf(stderr, "  -r ,  --regex REGEX\n");
+    fprintf(stderr, "                        when output matches regex prediction will terminate\n");
     fprintf(stderr, "\n");
 }
 
diff --git a/examples/common.h b/examples/common.h
index b08e576..65ca187 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -27,7 +27,10 @@ struct gpt_params {
     int32_t n_batch = 8; // batch size for prompt processing
 
     std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
+    bool silent = 0;
+    bool quiet = 0;
     std::string prompt;
+    std::string regex;
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp
index 99ed440..2c9d36e 100644
--- a/examples/gpt-j/main.cpp
+++ b/examples/gpt-j/main.cpp
@@ -13,7 +13,9 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <regex.h>
 
+#define OUTMAX (2048*10)
 // default hparams (GPT-J 6B)
 struct gptj_hparams {
     int32_t n_vocab = 50400;
@@ -25,6 +27,8 @@ struct gptj_hparams {
     int32_t ftype   = 1;
 };
 
+bool quiet=0;
+bool silent=0;
 struct gptj_layer {
     // normalization
     struct ggml_tensor * ln_1_g;
@@ -70,7 +74,7 @@ struct gptj_model {
 
 // load the model's weights from a file
 bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    if(!quiet) printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
     if (!fin) {
@@ -100,13 +104,13 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
         fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        if(!quiet)printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        if(!quiet)printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+        if(!quiet)printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
+        if(!quiet)printf("%s: n_head  = %d\n", __func__, hparams.n_head);
+        if(!quiet)printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+        if(!quiet)printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
+        if(!quiet)printf("%s: ftype   = %d\n", __func__, hparams.ftype);
     }
 
     // load vocab
@@ -182,7 +186,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
 
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        if(!quiet)printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
     // create the ggml context
@@ -280,7 +284,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
 
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 
-        printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+        if(!quiet)printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
     }
 
     // load weights
@@ -288,7 +292,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         int n_tensors = 0;
         size_t total_size = 0;
 
-        printf("%s: ", __func__);
+        if(!quiet)printf("%s: ", __func__);
 
         while (true) {
             int32_t n_dims;
@@ -332,7 +336,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
 
             // for debugging
             if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+                if(!quiet)printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
             }
 
             const size_t bpe = ggml_type_size(ggml_type(ttype));
@@ -348,14 +352,14 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
             //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
             total_size += ggml_nbytes(tensor);
             if (++n_tensors % 8 == 0) {
-                printf(".");
+                if(!quiet)printf(".");
                 fflush(stdout);
             }
         }
 
-        printf(" done\n");
+        if(!quiet)printf(" done\n");
 
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+        if(!quiet)printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
     }
 
     fin.close();
@@ -601,8 +605,10 @@ bool gptj_eval(
 int main(int argc, char ** argv) {
     const int64_t t_main_start_us = ggml_time_us();
-
+    char out[OUTMAX+1];
+    regex_t regex;
     gpt_params params;
+    memset(out,0,OUTMAX);
 
     params.model = "models/gpt-j-6B/ggml-model.bin";
 
     if (gpt_params_parse(argc, argv, params) == false) {
@@ -613,7 +619,19 @@ int main(int argc, char ** argv) {
         params.seed = time(NULL);
     }
 
-    printf("%s: seed = %d\n", __func__, params.seed);
+    quiet=params.quiet;
+    silent=params.silent;
+
+    if(!(params.regex.empty())) {
+        if(regcomp(&regex,params.regex.c_str(),0)){
+            fprintf(stderr,"Bad regex '%s'",params.regex.c_str());
+            exit(1);
+        } else {
+            if(!quiet)printf("regex = '%s'\n",params.regex.c_str());
+        }
+    }
+
+    if(!quiet)printf("%s: seed = %d\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.prompt.empty()) {
@@ -656,8 +674,8 @@ int main(int argc, char ** argv) {
 
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    printf("\n");
+    if(!quiet)printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    if(!quiet)printf("\n");
 
     std::vector<gpt_vocab::id> embd;
 
@@ -701,10 +719,17 @@ int main(int argc, char ** argv) {
 
             // add it to the context
             embd.push_back(id);
+            //concat to out str
+            strncat(out,vocab.id_to_token[id].c_str(),OUTMAX);
+            //display text
+            printf("%s", vocab.id_to_token[id].c_str());
         } else {
             // if here, it means we are still processing the input prompt
             for (int k = i; k < embd_inp.size(); k++) {
                 embd.push_back(embd_inp[k]);
+                //display processed prompt text if not in silent mode
+                if(!silent) printf("%s", vocab.id_to_token[embd_inp[k]].c_str());
+                fflush(stdout);
                 if (embd.size() > params.n_batch) {
                     break;
                 }
@@ -712,12 +737,13 @@ int main(int argc, char ** argv) {
             i += embd.size() - 1;
         }
 
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
+        if(!params.regex.empty()) {
+            if(!regexec(&regex,out,0,NULL,0)){
+                if(!quiet)printf("\nMatched reverse prompt.\n");
+                break;
+            }
         }
         fflush(stdout);
-
         // end of text token
         if (embd.back() == 50256) {
             break;
@@ -728,12 +754,12 @@ int main(int argc, char ** argv) {
     {
        const int64_t t_main_end_us = ggml_time_us();
 
-        printf("\n\n");
-        printf("%s:     mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:         load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:       sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:      predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:        total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        if(!quiet)printf("\n\n");
+        if(!quiet)printf("%s:     mem per token = %8zu bytes\n", __func__, mem_per_token);
+        if(!quiet)printf("%s:         load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        if(!quiet)printf("%s:       sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        if(!quiet)printf("%s:      predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        if(!quiet)printf("%s:        total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
 
     ggml_free(model.ctx);
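
A note on the regex-based stop condition introduced in examples/gpt-j/main.cpp: it relies on POSIX <regex.h> (regcomp/regexec), matching the pattern against the buffer of output accumulated so far after each generated token. The sketch below is a minimal standalone illustration of that pattern, not code from the patch; the pattern string, token list, and the tighter strncat bound are illustrative assumptions. One detail worth noting: the patch passes OUTMAX as strncat's limit, which bounds each individual append rather than the remaining space in the buffer; bounding by OUTMAX - strlen(out), as in the sketch, is the conventional way to keep the total within the buffer.

    /* minimal sketch of the accumulate-then-regexec stop condition (POSIX systems) */
    #include <regex.h>
    #include <stdio.h>
    #include <string.h>

    #define OUTMAX (2048*10)

    int main(void) {
        char out[OUTMAX + 1] = "";   /* accumulated output, zero-initialized */
        regex_t re;

        /* compile once; regcomp returns 0 on success */
        if (regcomp(&re, "User:", 0)) {
            fprintf(stderr, "Bad regex\n");
            return 1;
        }

        /* stand-in for the tokens produced by the model */
        const char *tokens[] = { "Hello", " world", "\n", "User:" };
        for (size_t i = 0; i < sizeof(tokens)/sizeof(tokens[0]); i++) {
            /* bound the append by the space actually left in the buffer */
            strncat(out, tokens[i], OUTMAX - strlen(out));

            /* regexec returns 0 when the pattern matches anywhere in out */
            if (!regexec(&re, out, 0, NULL, 0)) {
                printf("\nMatched reverse prompt.\n");
                break;
            }
        }

        regfree(&re);
        return 0;
    }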