diff --git a/cutText.js b/cutText.js
new file mode 100644
index 0000000..0e739c5
--- /dev/null
+++ b/cutText.js
@@ -0,0 +1,30 @@
+const Segment = require("segment");
+const POSTAG = require("segment/lib/POSTAG");
+
+const segment = new Segment();
+// useDefault() is called once at module load; cutText() awaits its result
+// before segmenting.
+const load = segment.useDefault();
+
+// Words whose tag (`p`) equals one of these are dropped by cutText().
+const EXCLUDED_TAGS = [POSTAG.D_U, POSTAG.D_P, POSTAG.A_M, POSTAG.D_D];
+
+/**
+ * Split a string into words with the shared Segment instance, stripping
+ * punctuation and dropping words whose tag is listed in EXCLUDED_TAGS.
+ *
+ * @param {string} str - Text to segment.
+ * @returns {Promise<string[]>} The `w` field of each remaining word, in order.
+ */
+async function cutText(str) {
+  await load;
+  const words = segment.doSegment(str, {
+    stripPunctuation: true
+  });
+
+  return words
+    .filter((v) => !EXCLUDED_TAGS.includes(v.p))
+    .map((v) => v.w);
+}
+
+module.exports = { cutText };
diff --git a/src/word2vec.c b/src/word2vec.c
index b9d3dc6..197f02f 100644
--- a/src/word2vec.c
+++ b/src/word2vec.c
@@ -17,6 +17,10 @@
 #include <string.h>
 #include <math.h>
 #include <pthread.h>
+#include <time.h>    // clock(), clock_t, CLOCKS_PER_SEC
+#ifdef _WIN32
+#include <malloc.h>  // _aligned_malloc
+#endif
 
 #define MAX_STRING 100
 #define EXP_TABLE_SIZE 1000
@@ -335,27 +339,87 @@ void ReadVocab() {
   fclose(fin);
 }
 
-void InitNet() {
+// Allocates the weight tables with 128-byte alignment: syn0 always, syn1
+// when hs is set, syn1neg when negative > 0. syn1 and syn1neg are zeroed,
+// syn0 is filled with pseudo-random values, then CreateBinaryTree() is
+// called. A non-zero posix_memalign() result is mapped to a NULL pointer
+// so both platforms share one failure check; on failure the process exits.
+void InitNet()
+{
   long long a, b;
   unsigned long long next_random = 1;
-  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
-  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
-  if (hs) {
-    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
-    if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
-    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
-     syn1[a * layer1_size + b] = 0;
+
+#ifdef _WIN32
+  syn0 = (real *)_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128);
+#else
+  if (posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)) != 0)
+  {
+    syn0 = NULL;
+  }
+#endif
+  if (syn0 == NULL)
+  {
+    printf("Memory allocation failed\n");
+    exit(1);
+  }
+
+  if (hs)
+  {
+#ifdef _WIN32
+    syn1 = (real *)_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128);
+#else
+    if (posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)) != 0)
+    {
+      syn1 = NULL;
+    }
+#endif
+    if (syn1 == NULL)
+    {
+      printf("Memory allocation failed\n");
+      exit(1);
+    }
+    for (a = 0; a < vocab_size; a++)
+    {
+      for (b = 0; b < layer1_size; b++)
+      {
+        syn1[a * layer1_size + b] = 0;
+      }
+    }
   }
-  if (negative>0) {
-    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
-    if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
-    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
-     syn1neg[a * layer1_size + b] = 0;
+
+  if (negative > 0)
+  {
+#ifdef _WIN32
+    syn1neg = (real *)_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128);
+#else
+    if (posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)) != 0)
+    {
+      syn1neg = NULL;
+    }
+#endif
+    if (syn1neg == NULL)
+    {
+      printf("Memory allocation failed\n");
+      exit(1);
+    }
+    for (a = 0; a < vocab_size; a++)
+    {
+      for (b = 0; b < layer1_size; b++)
+      {
+        syn1neg[a * layer1_size + b] = 0;
+      }
+    }
   }
-  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
+
+  for (a = 0; a < vocab_size; a++)
+  {
+    for (b = 0; b < layer1_size; b++)
+    {
     next_random = next_random * (unsigned long long)25214903917 + 11;
     syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
+    }
   }
+
   CreateBinaryTree();
 }
 
@@ -376,9 +440,9 @@ void *TrainModelThread(void *id) {
       last_word_count = word_count;
       if ((debug_mode > 1)) {
         now=clock();
-        printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
-         word_count_actual / (real)(iter * train_words + 1) * 100,
-         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
+        // printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
+        // word_count_actual / (real)(iter * train_words + 1) * 100,
+        // word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
         fflush(stdout);
       }
       alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
diff --git a/test/test1.js b/test/test1.js
new file mode 100644
index 0000000..2e46cfe
--- /dev/null
+++ b/test/test1.js
@@ -0,0 +1,89 @@
+const fs = require('fs');
+const path = require('path');
+const readline = require('readline');
+
+// Only referenced by the disabled training example at the bottom of this file.
+const word2vec = require('../');
+const w2v = require('./../lib');
+const { cutText } = require('../cutText');
+
+const PHRASES_FILE = path.join(__dirname, 'fixtures', 'phrases1.txt');
+const CUT_FILE = path.join(__dirname, 'fixtures', 'cut.txt');
+
+/**
+ * Read a file line by line, passing each line to `insert` and waiting for
+ * it to settle before reading the next. Logs the line count every 1000 lines.
+ *
+ * @param {string} filePath - File to read.
+ * @param {function(string): (void|Promise<void>)} insert - Called once per line.
+ * @returns {Promise<Array>} Resolves to an empty array after the last line;
+ *   no lines are collected, the value is only kept for existing callers.
+ */
+async function readLine(filePath, insert) {
+  const fileStream = fs.createReadStream(filePath);
+  const lineReader = readline.createInterface({
+    input: fileStream,
+    crlfDelay: Infinity
+  });
+
+  const lines = [];
+  let count = 0;
+  for await (const fileLine of lineReader) {
+    await insert(fileLine);
+    count++;
+    if (count % 1000 === 0) {
+      console.log(count);
+    }
+  }
+  return lines;
+}
+
+/**
+ * Segment every line of PHRASES_FILE with cutText() and write the words,
+ * space-separated, one output line per input line, to CUT_FILE.
+ *
+ * @returns {Promise<void>}
+ */
+async function cutPhrases() {
+  const out = fs.createWriteStream(CUT_FILE, { flags: 'w+' });
+  try {
+    await readLine(PHRASES_FILE, async (line) => {
+      const words = await cutText(line);
+      out.write(words.join(' ') + '\n');
+    });
+  } finally {
+    // End the stream on success and failure alike so the file is closed.
+    out.end();
+  }
+}
+
+// NOTE(review): assumes word2phrase() invokes its fourth argument once the
+// phrases file has been written, as upstream node-word2vec documents —
+// confirm for this fork. Segmentation starts from that callback so it does
+// not read the phrases file before it exists.
+w2v.word2phrase('E:/web/wordApp/tmp/遮天.txt', PHRASES_FILE, {
+  threshold: 5,
+  debug: 2,
+  minCount: 2
+}, () => {
+  cutPhrases().catch((err) => {
+    console.error(err);
+    process.exitCode = 1;
+  });
+});
+
+// Disabled example: train the Word2Vec model.
+// word2vec.word2vec("E:/web/wordApp/tmp/out.txt", './output_model.txt', {
+//   size: 200,
+//   window: 5,
+//   minCount: 4,
+//   threshold: 90,
+//   cbow: 1
+// }, (error) => {
+//   if (error) {
+//     console.error(error);
+//     return;
+//   }
+//
+//   console.log('模型训练完成');
+// });