/**
 * Build script for lunr-languages.
 *
 * Execute like this (from the project root folder):
 *   node build/build.js
 *
 * For every entry in `list` it writes `lunr.<locale>.js` (beautified) and
 * `min/lunr.<locale>.min.js` (minified), then minifies the stemmer support
 * and multi-language files.
 */
var fs = require('fs');
var beautify = require('js-beautify').js_beautify;
var UglifyJS = require("uglify-js");

/**
 * Shortcut for minifying a piece of code.
 *
 * @param {string} orig_code - JavaScript source text.
 * @returns {string} The `code` property of the UglifyJS.minify result.
 */
function compress(orig_code) {
  return UglifyJS.minify(orig_code, {
    fromString: true,
    comments: true
  }).code;
}

/**
 * Replace every `{{name}}` placeholder in `text` with `value`.
 *
 * The value is handed to String.prototype.replace through a replacer
 * function, so `$` sequences inside it (`$&`, `$1`, `$$`, ...) are inserted
 * literally instead of being interpreted as replacement patterns.
 *
 * @param {string} text - Template text containing placeholders.
 * @param {string} name - Placeholder name, without the curly braces.
 * @param {*} value - Replacement; converted to a string by `replace`.
 * @returns {string} The text with all occurrences of the placeholder filled.
 */
function fillPlaceholder(text, name, value) {
  var pattern = new RegExp('\\{\\{' + name + '\\}\\}', 'g');
  return text.replace(pattern, function() {
    return value;
  });
}

// take some of the stop words list from the stopwords-filter repo
var stopwordsRepoFolder = './stopwords-filter/lib/stopwords/snowball/locales/';
// and, since that repository does not include all the stopwords we want,
// we add more, custom stopwords lists
var stopwordsCustomFolder = './stopwords-custom/';

/**
 * Use the Unicode library to produce a regex for characters of a particular
 * 'script' (such as Latin), then extract the character ranges from that
 * regex for use in our trimmer.
 *
 * @param {string} script - Script name used to locate the
 *   `unicode-8.0.0/scripts/<script>/regex` module.
 * @returns {string} The inside of the character class, e.g. "a-z" for /[a-z]/.
 * @throws {Error} If the regex is not of the form /[...]/.
 */
function wordCharacters(script) {
  var charRegex = require('unicode-8.0.0/scripts/' + script + '/regex');
  // Now from /[a-z]/ get "a-z"
  var regexString = charRegex.toString();
  // Format sanity check
  if (regexString.slice(0, 2) !== '/[' || regexString.slice(-2) !== ']/') {
    var message = 'Unexpected regex structure, aborting: ' + regexString;
    console.error(message);
    // Throw a real Error instance so the failure carries a message and stack.
    throw new Error(message);
  }
  return regexString.slice(2, -2);
}

// list mapping between locale, stemmer file, stopwords file, and char pattern.
// Entries that lack `file`/`stopwords` are not generated from the template:
// their existing lunr.<locale>.js is only beautified and minified.
var list = [
  {
    locale: 'ar',
  }, {
    locale: 'hi'
  }, {
    locale: 'da',
    file: 'DanishStemmer.js',
    stopwords: stopwordsRepoFolder + 'da.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'nl',
    file: 'DutchStemmer.js',
    stopwords: stopwordsRepoFolder + 'nl.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    /*
      Kept here to prevent breaking changes. The correct code for Dutch is NL.
      Please do not use "du" anymore, start using "nl".
      I will remove "du" next time I'll build a major, backward incompatible package
    */
    locale: 'du',
    file: 'DutchStemmer.js',
    stopwords: stopwordsRepoFolder + 'nl.csv',
    wordCharacters: wordCharacters('Latin'),
    warningMessage: '[Lunr Languages] Please use the "nl" instead of the "du". The "nl" code is the standard code for Dutch language, and "du" will be removed in the next major versions.'
  }, {
    locale: 'fi',
    file: 'FinnishStemmer.js',
    stopwords: stopwordsRepoFolder + 'fn.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'fr',
    file: 'FrenchStemmer.js',
    stopwords: stopwordsRepoFolder + 'fr.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'de',
    file: 'GermanStemmer.js',
    stopwords: stopwordsRepoFolder + 'de.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'hu',
    file: 'HungarianStemmer.js',
    stopwords: stopwordsRepoFolder + 'hu.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'it',
    file: 'ItalianStemmer.js',
    stopwords: stopwordsRepoFolder + 'it.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'ja'
  }, {
    locale: 'jp'
  }, {
    locale: 'kn'
  }, {
    locale: 'no',
    file: 'NorwegianStemmer.js',
    stopwords: stopwordsCustomFolder + 'no.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'pt',
    file: 'PortugueseStemmer.js',
    stopwords: stopwordsRepoFolder + 'pt.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'ro',
    file: 'RomanianStemmer.js',
    stopwords: stopwordsCustomFolder + 'ro.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'ru',
    file: 'RussianStemmer.js',
    stopwords: stopwordsCustomFolder + 'ru.csv',
    wordCharacters: wordCharacters('Cyrillic')
  }, {
    locale: 'es',
    file: 'SpanishStemmer.js',
    stopwords: stopwordsRepoFolder + 'es.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'sa'
  }, {
    locale: 'sv',
    file: 'SwedishStemmer.js',
    stopwords: stopwordsCustomFolder + 'sv.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'ta',
  }, {
    locale: 'te'
  }, {
    locale: 'tr',
    file: 'TurkishStemmer.js',
    stopwords: stopwordsCustomFolder + 'tr.csv',
    wordCharacters: wordCharacters('Latin')
  }, {
    locale: 'th',
  }, {
    locale: 'vi',
  }, {
    locale: 'zh',
  }, {
    locale: 'ko',
  }, {
    locale: 'hy',
  }, {
    locale: 'he',
  }, {
    locale: 'el',
  }
];

console.log('Starting building lunr-languages ...');

// read templates
var tpl = fs.readFileSync('build/lunr.template', 'utf8');
var cm = fs.readFileSync('build/lunr.comments', 'utf8');

// for each language, start building
for (var i = 0; i < list.length; i++) {
  var entry = list[i];
  console.log('Building for "' + entry.locale + '"');

  var f;
  var languageName;
  var fromTemplate = entry.file && entry.stopwords;

  if (fromTemplate) {
    var data = fs.readFileSync('build/snowball-js/stemmer/src/ext/' + entry.file, 'utf8');
    var stopWords = fs.readFileSync('build/' + entry.stopwords, 'utf8');
    var stopWordsList = stopWords.split(',');
    // e.g. "DanishStemmer.js" -> "Danish"
    languageName = entry.file.replace(/Stemmer\.js/g, '');

    // start replacing the placeholders (comments header first, then template)
    f = cm + tpl;
    f = fillPlaceholder(f, 'locale', entry.locale);
    // keep the stemmer source from its first "function" keyword onwards
    f = fillPlaceholder(f, 'stemmerFunction', data.substring(data.indexOf('function')));
    // the count is taken before sorting; sort() reorders in place, length is unchanged
    var stopWordsLength = stopWordsList.length + 1;
    f = fillPlaceholder(f, 'stopWords', stopWordsList.sort().join(' '));
    f = fillPlaceholder(f, 'stopWordsLength', stopWordsLength);
    f = fillPlaceholder(f, 'languageName', languageName);
    f = fillPlaceholder(f, 'wordCharacters', entry.wordCharacters);
    f = fillPlaceholder(f, 'consoleWarning', entry.warningMessage ?
      '\n\nconsole.warn(' + JSON.stringify(entry.warningMessage) + ');' :
      '');
  } else {
    // beautify and minify languages not generated from the template.
    f = fs.readFileSync('lunr.' + entry.locale + '.js', 'utf8');
  }

  // write the full file
  fs.writeFileSync('lunr.' + entry.locale + '.js', beautify(f, {
    indent_size: 2
  }));

  // and the minified version; template-built files get the comments header
  // prepended to the minified code
  var minified = fromTemplate ?
    fillPlaceholder(cm, 'languageName', languageName) + compress(f) :
    compress(f);
  fs.writeFileSync('min/lunr.' + entry.locale + '.min.js', minified);
}

console.log('Building Stemmer Support');
// build stemmer support
var support = fs.readFileSync('lunr.stemmer.support.js', 'utf8');
fs.writeFileSync('min/lunr.stemmer.support.min.js', compress(support));

console.log('Building Multi-Language Extension');
// build multi
var multi = fs.readFileSync('lunr.multi.js', 'utf8');
fs.writeFileSync('min/lunr.multi.min.js', compress(multi));

console.log('Done!');