1*3117ece4Schristos /* 2*3117ece4Schristos * Copyright (c) Meta Platforms, Inc. and affiliates. 3*3117ece4Schristos * All rights reserved. 4*3117ece4Schristos * 5*3117ece4Schristos * This source code is licensed under both the BSD-style license (found in the 6*3117ece4Schristos * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7*3117ece4Schristos * in the COPYING file in the root directory of this source tree). 8*3117ece4Schristos * You may select, at your option, one of the above-listed licenses. 9*3117ece4Schristos */ 10*3117ece4Schristos 11*3117ece4Schristos /* Implementation notes: 12*3117ece4Schristos * 13*3117ece4Schristos * This is a very simple lorem ipsum generator 14*3117ece4Schristos * which features a static list of words 15*3117ece4Schristos * and print them one after another randomly 16*3117ece4Schristos * with a fake sentence / paragraph structure. 17*3117ece4Schristos * 18*3117ece4Schristos * The goal is to generate a printable text 19*3117ece4Schristos * that can be used to fake a text compression scenario. 20*3117ece4Schristos * The resulting compression / ratio curve of the lorem ipsum generator 21*3117ece4Schristos * is more satisfying than the previous statistical generator, 22*3117ece4Schristos * which was initially designed for entropy compression, 23*3117ece4Schristos * and lacks a regularity more representative of text. 24*3117ece4Schristos * 25*3117ece4Schristos * The compression ratio achievable on the generated lorem ipsum 26*3117ece4Schristos * is still a bit too good, presumably because the dictionary is a bit too 27*3117ece4Schristos * small. It would be possible to create some more complex scheme, notably by 28*3117ece4Schristos * enlarging the dictionary with a word generator, and adding grammatical rules 29*3117ece4Schristos * (composition) and syntax rules. But that's probably overkill for the intended 30*3117ece4Schristos * goal. 31*3117ece4Schristos */ 32*3117ece4Schristos 33*3117ece4Schristos #include "lorem.h" 34*3117ece4Schristos #include <assert.h> 35*3117ece4Schristos #include <limits.h> /* INT_MAX */ 36*3117ece4Schristos #include <string.h> /* memcpy */ 37*3117ece4Schristos 38*3117ece4Schristos #define WORD_MAX_SIZE 20 39*3117ece4Schristos 40*3117ece4Schristos /* Define the word pool */ 41*3117ece4Schristos static const char* kWords[] = { 42*3117ece4Schristos "lorem", "ipsum", "dolor", "sit", "amet", 43*3117ece4Schristos "consectetur", "adipiscing", "elit", "sed", "do", 44*3117ece4Schristos "eiusmod", "tempor", "incididunt", "ut", "labore", 45*3117ece4Schristos "et", "dolore", "magna", "aliqua", "dis", 46*3117ece4Schristos "lectus", "vestibulum", "mattis", "ullamcorper", "velit", 47*3117ece4Schristos "commodo", "a", "lacus", "arcu", "magnis", 48*3117ece4Schristos "parturient", "montes", "nascetur", "ridiculus", "mus", 49*3117ece4Schristos "mauris", "nulla", "malesuada", "pellentesque", "eget", 50*3117ece4Schristos "gravida", "in", "dictum", "non", "erat", 51*3117ece4Schristos "nam", "voluptat", "maecenas", "blandit", "aliquam", 52*3117ece4Schristos "etiam", "enim", "lobortis", "scelerisque", "fermentum", 53*3117ece4Schristos "dui", "faucibus", "ornare", "at", "elementum", 54*3117ece4Schristos "eu", "facilisis", "odio", "morbi", "quis", 55*3117ece4Schristos "eros", "donec", "ac", "orci", "purus", 56*3117ece4Schristos "turpis", "cursus", "leo", "vel", "porta", 57*3117ece4Schristos "consequat", "interdum", "varius", "vulputate", "aliquet", 58*3117ece4Schristos "pharetra", "nunc", "auctor", "urna", "id", 59*3117ece4Schristos "metus", "viverra", "nibh", "cras", "mi", 60*3117ece4Schristos "unde", "omnis", "iste", "natus", "error", 61*3117ece4Schristos "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium", 62*3117ece4Schristos "totam", "rem", "aperiam", "eaque", "ipsa", 63*3117ece4Schristos "quae", "ab", "illo", "inventore", "veritatis", 64*3117ece4Schristos "quasi", "architecto", "beatae", "vitae", "dicta", 65*3117ece4Schristos "sunt", "explicabo", "nemo", "ipsam", "quia", 66*3117ece4Schristos "voluptas", "aspernatur", "aut", "odit", "fugit", 67*3117ece4Schristos "consequuntur", "magni", "dolores", "eos", "qui", 68*3117ece4Schristos "ratione", "sequi", "nesciunt", "neque", "porro", 69*3117ece4Schristos "quisquam", "est", "dolorem", "adipisci", "numquam", 70*3117ece4Schristos "eius", "modi", "tempora", "incidunt", "magnam", 71*3117ece4Schristos "quaerat", "ad", "minima", "veniam", "nostrum", 72*3117ece4Schristos "ullam", "corporis", "suscipit", "laboriosam", "nisi", 73*3117ece4Schristos "aliquid", "ex", "ea", "commodi", "consequatur", 74*3117ece4Schristos "autem", "eum", "iure", "voluptate", "esse", 75*3117ece4Schristos "quam", "nihil", "molestiae", "illum", "fugiat", 76*3117ece4Schristos "quo", "pariatur", "vero", "accusamus", "iusto", 77*3117ece4Schristos "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum", 78*3117ece4Schristos "deleniti", "atque", "corrupti", "quos", "quas", 79*3117ece4Schristos "molestias", "excepturi", "sint", "occaecati", "cupiditate", 80*3117ece4Schristos "provident", "similique", "culpa", "officia", "deserunt", 81*3117ece4Schristos "mollitia", "animi", "laborum", "dolorum", "fuga", 82*3117ece4Schristos "harum", "quidem", "rerum", "facilis", "expedita", 83*3117ece4Schristos "distinctio", "libero", "tempore", "cum", "soluta", 84*3117ece4Schristos "nobis", "eligendi", "optio", "cumque", "impedit", 85*3117ece4Schristos "minus", "quod", "maxime", "placeat", "facere", 86*3117ece4Schristos "possimus", "assumenda", "repellendus", "temporibus", "quibusdam", 87*3117ece4Schristos "officiis", "debitis", "saepe", "eveniet", "voluptates", 88*3117ece4Schristos "repudiandae", "recusandae", "itaque", "earum", "hic", 89*3117ece4Schristos "tenetur", "sapiente", "delectus", "reiciendis", "cillum", 90*3117ece4Schristos "maiores", "alias", "perferendis", "doloribus", "asperiores", 91*3117ece4Schristos "repellat", "minim", "nostrud", "exercitation", "ullamco", 92*3117ece4Schristos "laboris", "aliquip", "duis", "aute", "irure", 93*3117ece4Schristos }; 94*3117ece4Schristos static const unsigned kNbWords = sizeof(kWords) / sizeof(kWords[0]); 95*3117ece4Schristos 96*3117ece4Schristos /* simple 1-dimension distribution, based on word's length, favors small words 97*3117ece4Schristos */ 98*3117ece4Schristos static const int kWeights[] = { 0, 8, 6, 4, 3, 2 }; 99*3117ece4Schristos static const size_t kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]); 100*3117ece4Schristos 101*3117ece4Schristos #define DISTRIB_SIZE_MAX 650 102*3117ece4Schristos static int g_distrib[DISTRIB_SIZE_MAX] = { 0 }; 103*3117ece4Schristos static unsigned g_distribCount = 0; 104*3117ece4Schristos 105*3117ece4Schristos static void countFreqs( 106*3117ece4Schristos const char* words[], 107*3117ece4Schristos size_t nbWords, 108*3117ece4Schristos const int* weights, 109*3117ece4Schristos size_t nbWeights) 110*3117ece4Schristos { 111*3117ece4Schristos unsigned total = 0; 112*3117ece4Schristos size_t w; 113*3117ece4Schristos for (w = 0; w < nbWords; w++) { 114*3117ece4Schristos size_t len = strlen(words[w]); 115*3117ece4Schristos int lmax; 116*3117ece4Schristos if (len >= nbWeights) 117*3117ece4Schristos len = nbWeights - 1; 118*3117ece4Schristos lmax = weights[len]; 119*3117ece4Schristos total += (unsigned)lmax; 120*3117ece4Schristos } 121*3117ece4Schristos g_distribCount = total; 122*3117ece4Schristos assert(g_distribCount <= DISTRIB_SIZE_MAX); 123*3117ece4Schristos } 124*3117ece4Schristos 125*3117ece4Schristos static void init_word_distrib( 126*3117ece4Schristos const char* words[], 127*3117ece4Schristos size_t nbWords, 128*3117ece4Schristos const int* weights, 129*3117ece4Schristos size_t nbWeights) 130*3117ece4Schristos { 131*3117ece4Schristos size_t w, d = 0; 132*3117ece4Schristos countFreqs(words, nbWords, weights, nbWeights); 133*3117ece4Schristos for (w = 0; w < nbWords; w++) { 134*3117ece4Schristos size_t len = strlen(words[w]); 135*3117ece4Schristos int l, lmax; 136*3117ece4Schristos if (len >= nbWeights) 137*3117ece4Schristos len = nbWeights - 1; 138*3117ece4Schristos lmax = weights[len]; 139*3117ece4Schristos for (l = 0; l < lmax; l++) { 140*3117ece4Schristos g_distrib[d++] = (int)w; 141*3117ece4Schristos } 142*3117ece4Schristos } 143*3117ece4Schristos } 144*3117ece4Schristos 145*3117ece4Schristos /* Note: this unit only works when invoked sequentially. 146*3117ece4Schristos * No concurrent access is allowed */ 147*3117ece4Schristos static char* g_ptr = NULL; 148*3117ece4Schristos static size_t g_nbChars = 0; 149*3117ece4Schristos static size_t g_maxChars = 10000000; 150*3117ece4Schristos static unsigned g_randRoot = 0; 151*3117ece4Schristos 152*3117ece4Schristos #define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r))) 153*3117ece4Schristos static unsigned LOREM_rand(unsigned range) 154*3117ece4Schristos { 155*3117ece4Schristos static const unsigned prime1 = 2654435761U; 156*3117ece4Schristos static const unsigned prime2 = 2246822519U; 157*3117ece4Schristos unsigned rand32 = g_randRoot; 158*3117ece4Schristos rand32 *= prime1; 159*3117ece4Schristos rand32 ^= prime2; 160*3117ece4Schristos rand32 = RDG_rotl32(rand32, 13); 161*3117ece4Schristos g_randRoot = rand32; 162*3117ece4Schristos return (unsigned)(((unsigned long long)rand32 * range) >> 32); 163*3117ece4Schristos } 164*3117ece4Schristos 165*3117ece4Schristos static void writeLastCharacters(void) 166*3117ece4Schristos { 167*3117ece4Schristos size_t lastChars = g_maxChars - g_nbChars; 168*3117ece4Schristos assert(g_maxChars >= g_nbChars); 169*3117ece4Schristos if (lastChars == 0) 170*3117ece4Schristos return; 171*3117ece4Schristos g_ptr[g_nbChars++] = '.'; 172*3117ece4Schristos if (lastChars > 2) { 173*3117ece4Schristos memset(g_ptr + g_nbChars, ' ', lastChars - 2); 174*3117ece4Schristos } 175*3117ece4Schristos if (lastChars > 1) { 176*3117ece4Schristos g_ptr[g_maxChars - 1] = '\n'; 177*3117ece4Schristos } 178*3117ece4Schristos g_nbChars = g_maxChars; 179*3117ece4Schristos } 180*3117ece4Schristos 181*3117ece4Schristos static void generateWord(const char* word, const char* separator, int upCase) 182*3117ece4Schristos { 183*3117ece4Schristos size_t const len = strlen(word) + strlen(separator); 184*3117ece4Schristos if (g_nbChars + len > g_maxChars) { 185*3117ece4Schristos writeLastCharacters(); 186*3117ece4Schristos return; 187*3117ece4Schristos } 188*3117ece4Schristos memcpy(g_ptr + g_nbChars, word, strlen(word)); 189*3117ece4Schristos if (upCase) { 190*3117ece4Schristos static const char toUp = 'A' - 'a'; 191*3117ece4Schristos g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp); 192*3117ece4Schristos } 193*3117ece4Schristos g_nbChars += strlen(word); 194*3117ece4Schristos memcpy(g_ptr + g_nbChars, separator, strlen(separator)); 195*3117ece4Schristos g_nbChars += strlen(separator); 196*3117ece4Schristos } 197*3117ece4Schristos 198*3117ece4Schristos static int about(unsigned target) 199*3117ece4Schristos { 200*3117ece4Schristos return (int)(LOREM_rand(target) + LOREM_rand(target) + 1); 201*3117ece4Schristos } 202*3117ece4Schristos 203*3117ece4Schristos /* Function to generate a random sentence */ 204*3117ece4Schristos static void generateSentence(int nbWords) 205*3117ece4Schristos { 206*3117ece4Schristos int commaPos = about(9); 207*3117ece4Schristos int comma2 = commaPos + about(7); 208*3117ece4Schristos int qmark = (LOREM_rand(11) == 7); 209*3117ece4Schristos const char* endSep = qmark ? "? " : ". "; 210*3117ece4Schristos int i; 211*3117ece4Schristos for (i = 0; i < nbWords; i++) { 212*3117ece4Schristos int const wordID = g_distrib[LOREM_rand(g_distribCount)]; 213*3117ece4Schristos const char* const word = kWords[wordID]; 214*3117ece4Schristos const char* sep = " "; 215*3117ece4Schristos if (i == commaPos) 216*3117ece4Schristos sep = ", "; 217*3117ece4Schristos if (i == comma2) 218*3117ece4Schristos sep = ", "; 219*3117ece4Schristos if (i == nbWords - 1) 220*3117ece4Schristos sep = endSep; 221*3117ece4Schristos generateWord(word, sep, i == 0); 222*3117ece4Schristos } 223*3117ece4Schristos } 224*3117ece4Schristos 225*3117ece4Schristos static void generateParagraph(int nbSentences) 226*3117ece4Schristos { 227*3117ece4Schristos int i; 228*3117ece4Schristos for (i = 0; i < nbSentences; i++) { 229*3117ece4Schristos int wordsPerSentence = about(11); 230*3117ece4Schristos generateSentence(wordsPerSentence); 231*3117ece4Schristos } 232*3117ece4Schristos if (g_nbChars < g_maxChars) { 233*3117ece4Schristos g_ptr[g_nbChars++] = '\n'; 234*3117ece4Schristos } 235*3117ece4Schristos if (g_nbChars < g_maxChars) { 236*3117ece4Schristos g_ptr[g_nbChars++] = '\n'; 237*3117ece4Schristos } 238*3117ece4Schristos } 239*3117ece4Schristos 240*3117ece4Schristos /* It's "common" for lorem ipsum generators to start with the same first 241*3117ece4Schristos * pre-defined sentence */ 242*3117ece4Schristos static void generateFirstSentence(void) 243*3117ece4Schristos { 244*3117ece4Schristos int i; 245*3117ece4Schristos for (i = 0; i < 18; i++) { 246*3117ece4Schristos const char* word = kWords[i]; 247*3117ece4Schristos const char* separator = " "; 248*3117ece4Schristos if (i == 4) 249*3117ece4Schristos separator = ", "; 250*3117ece4Schristos if (i == 7) 251*3117ece4Schristos separator = ", "; 252*3117ece4Schristos generateWord(word, separator, i == 0); 253*3117ece4Schristos } 254*3117ece4Schristos generateWord(kWords[18], ". ", 0); 255*3117ece4Schristos } 256*3117ece4Schristos 257*3117ece4Schristos size_t 258*3117ece4Schristos LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill) 259*3117ece4Schristos { 260*3117ece4Schristos g_ptr = (char*)buffer; 261*3117ece4Schristos assert(size < INT_MAX); 262*3117ece4Schristos g_maxChars = size; 263*3117ece4Schristos g_nbChars = 0; 264*3117ece4Schristos g_randRoot = seed; 265*3117ece4Schristos if (g_distribCount == 0) { 266*3117ece4Schristos init_word_distrib(kWords, kNbWords, kWeights, kNbWeights); 267*3117ece4Schristos } 268*3117ece4Schristos 269*3117ece4Schristos if (first) { 270*3117ece4Schristos generateFirstSentence(); 271*3117ece4Schristos } 272*3117ece4Schristos while (g_nbChars < g_maxChars) { 273*3117ece4Schristos int sentencePerParagraph = about(7); 274*3117ece4Schristos generateParagraph(sentencePerParagraph); 275*3117ece4Schristos if (!fill) 276*3117ece4Schristos break; /* only generate one paragraph in not-fill mode */ 277*3117ece4Schristos } 278*3117ece4Schristos g_ptr = NULL; 279*3117ece4Schristos return g_nbChars; 280*3117ece4Schristos } 281*3117ece4Schristos 282*3117ece4Schristos void LOREM_genBuffer(void* buffer, size_t size, unsigned seed) 283*3117ece4Schristos { 284*3117ece4Schristos LOREM_genBlock(buffer, size, seed, 1, 1); 285*3117ece4Schristos } 286