1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 /* Implementation notes: 12 * 13 * This is a very simple lorem ipsum generator 14 * which features a static list of words 15 * and print them one after another randomly 16 * with a fake sentence / paragraph structure. 17 * 18 * The goal is to generate a printable text 19 * that can be used to fake a text compression scenario. 20 * The resulting compression / ratio curve of the lorem ipsum generator 21 * is more satisfying than the previous statistical generator, 22 * which was initially designed for entropy compression, 23 * and lacks a regularity more representative of text. 24 * 25 * The compression ratio achievable on the generated lorem ipsum 26 * is still a bit too good, presumably because the dictionary is a bit too 27 * small. It would be possible to create some more complex scheme, notably by 28 * enlarging the dictionary with a word generator, and adding grammatical rules 29 * (composition) and syntax rules. But that's probably overkill for the intended 30 * goal. 31 */ 32 33 #include "lorem.h" 34 #include <assert.h> 35 #include <limits.h> /* INT_MAX */ 36 #include <string.h> /* memcpy */ 37 38 #define WORD_MAX_SIZE 20 39 40 /* Define the word pool */ 41 static const char* kWords[] = { 42 "lorem", "ipsum", "dolor", "sit", "amet", 43 "consectetur", "adipiscing", "elit", "sed", "do", 44 "eiusmod", "tempor", "incididunt", "ut", "labore", 45 "et", "dolore", "magna", "aliqua", "dis", 46 "lectus", "vestibulum", "mattis", "ullamcorper", "velit", 47 "commodo", "a", "lacus", "arcu", "magnis", 48 "parturient", "montes", "nascetur", "ridiculus", "mus", 49 "mauris", "nulla", "malesuada", "pellentesque", "eget", 50 "gravida", "in", "dictum", "non", "erat", 51 "nam", "voluptat", "maecenas", "blandit", "aliquam", 52 "etiam", "enim", "lobortis", "scelerisque", "fermentum", 53 "dui", "faucibus", "ornare", "at", "elementum", 54 "eu", "facilisis", "odio", "morbi", "quis", 55 "eros", "donec", "ac", "orci", "purus", 56 "turpis", "cursus", "leo", "vel", "porta", 57 "consequat", "interdum", "varius", "vulputate", "aliquet", 58 "pharetra", "nunc", "auctor", "urna", "id", 59 "metus", "viverra", "nibh", "cras", "mi", 60 "unde", "omnis", "iste", "natus", "error", 61 "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium", 62 "totam", "rem", "aperiam", "eaque", "ipsa", 63 "quae", "ab", "illo", "inventore", "veritatis", 64 "quasi", "architecto", "beatae", "vitae", "dicta", 65 "sunt", "explicabo", "nemo", "ipsam", "quia", 66 "voluptas", "aspernatur", "aut", "odit", "fugit", 67 "consequuntur", "magni", "dolores", "eos", "qui", 68 "ratione", "sequi", "nesciunt", "neque", "porro", 69 "quisquam", "est", "dolorem", "adipisci", "numquam", 70 "eius", "modi", "tempora", "incidunt", "magnam", 71 "quaerat", "ad", "minima", "veniam", "nostrum", 72 "ullam", "corporis", "suscipit", "laboriosam", "nisi", 73 "aliquid", "ex", "ea", "commodi", "consequatur", 74 "autem", "eum", "iure", "voluptate", "esse", 75 "quam", "nihil", "molestiae", "illum", "fugiat", 76 "quo", "pariatur", "vero", "accusamus", "iusto", 77 "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum", 78 "deleniti", "atque", "corrupti", "quos", "quas", 79 "molestias", "excepturi", "sint", "occaecati", "cupiditate", 80 "provident", "similique", "culpa", "officia", "deserunt", 81 "mollitia", "animi", "laborum", "dolorum", "fuga", 82 "harum", "quidem", "rerum", "facilis", "expedita", 83 "distinctio", "libero", "tempore", "cum", "soluta", 84 "nobis", "eligendi", "optio", "cumque", "impedit", 85 "minus", "quod", "maxime", "placeat", "facere", 86 "possimus", "assumenda", "repellendus", "temporibus", "quibusdam", 87 "officiis", "debitis", "saepe", "eveniet", "voluptates", 88 "repudiandae", "recusandae", "itaque", "earum", "hic", 89 "tenetur", "sapiente", "delectus", "reiciendis", "cillum", 90 "maiores", "alias", "perferendis", "doloribus", "asperiores", 91 "repellat", "minim", "nostrud", "exercitation", "ullamco", 92 "laboris", "aliquip", "duis", "aute", "irure", 93 }; 94 static const unsigned kNbWords = sizeof(kWords) / sizeof(kWords[0]); 95 96 /* simple 1-dimension distribution, based on word's length, favors small words 97 */ 98 static const int kWeights[] = { 0, 8, 6, 4, 3, 2 }; 99 static const size_t kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]); 100 101 #define DISTRIB_SIZE_MAX 650 102 static int g_distrib[DISTRIB_SIZE_MAX] = { 0 }; 103 static unsigned g_distribCount = 0; 104 105 static void countFreqs( 106 const char* words[], 107 size_t nbWords, 108 const int* weights, 109 size_t nbWeights) 110 { 111 unsigned total = 0; 112 size_t w; 113 for (w = 0; w < nbWords; w++) { 114 size_t len = strlen(words[w]); 115 int lmax; 116 if (len >= nbWeights) 117 len = nbWeights - 1; 118 lmax = weights[len]; 119 total += (unsigned)lmax; 120 } 121 g_distribCount = total; 122 assert(g_distribCount <= DISTRIB_SIZE_MAX); 123 } 124 125 static void init_word_distrib( 126 const char* words[], 127 size_t nbWords, 128 const int* weights, 129 size_t nbWeights) 130 { 131 size_t w, d = 0; 132 countFreqs(words, nbWords, weights, nbWeights); 133 for (w = 0; w < nbWords; w++) { 134 size_t len = strlen(words[w]); 135 int l, lmax; 136 if (len >= nbWeights) 137 len = nbWeights - 1; 138 lmax = weights[len]; 139 for (l = 0; l < lmax; l++) { 140 g_distrib[d++] = (int)w; 141 } 142 } 143 } 144 145 /* Note: this unit only works when invoked sequentially. 146 * No concurrent access is allowed */ 147 static char* g_ptr = NULL; 148 static size_t g_nbChars = 0; 149 static size_t g_maxChars = 10000000; 150 static unsigned g_randRoot = 0; 151 152 #define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r))) 153 static unsigned LOREM_rand(unsigned range) 154 { 155 static const unsigned prime1 = 2654435761U; 156 static const unsigned prime2 = 2246822519U; 157 unsigned rand32 = g_randRoot; 158 rand32 *= prime1; 159 rand32 ^= prime2; 160 rand32 = RDG_rotl32(rand32, 13); 161 g_randRoot = rand32; 162 return (unsigned)(((unsigned long long)rand32 * range) >> 32); 163 } 164 165 static void writeLastCharacters(void) 166 { 167 size_t lastChars = g_maxChars - g_nbChars; 168 assert(g_maxChars >= g_nbChars); 169 if (lastChars == 0) 170 return; 171 g_ptr[g_nbChars++] = '.'; 172 if (lastChars > 2) { 173 memset(g_ptr + g_nbChars, ' ', lastChars - 2); 174 } 175 if (lastChars > 1) { 176 g_ptr[g_maxChars - 1] = '\n'; 177 } 178 g_nbChars = g_maxChars; 179 } 180 181 static void generateWord(const char* word, const char* separator, int upCase) 182 { 183 size_t const len = strlen(word) + strlen(separator); 184 if (g_nbChars + len > g_maxChars) { 185 writeLastCharacters(); 186 return; 187 } 188 memcpy(g_ptr + g_nbChars, word, strlen(word)); 189 if (upCase) { 190 static const char toUp = 'A' - 'a'; 191 g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp); 192 } 193 g_nbChars += strlen(word); 194 memcpy(g_ptr + g_nbChars, separator, strlen(separator)); 195 g_nbChars += strlen(separator); 196 } 197 198 static int about(unsigned target) 199 { 200 return (int)(LOREM_rand(target) + LOREM_rand(target) + 1); 201 } 202 203 /* Function to generate a random sentence */ 204 static void generateSentence(int nbWords) 205 { 206 int commaPos = about(9); 207 int comma2 = commaPos + about(7); 208 int qmark = (LOREM_rand(11) == 7); 209 const char* endSep = qmark ? "? " : ". "; 210 int i; 211 for (i = 0; i < nbWords; i++) { 212 int const wordID = g_distrib[LOREM_rand(g_distribCount)]; 213 const char* const word = kWords[wordID]; 214 const char* sep = " "; 215 if (i == commaPos) 216 sep = ", "; 217 if (i == comma2) 218 sep = ", "; 219 if (i == nbWords - 1) 220 sep = endSep; 221 generateWord(word, sep, i == 0); 222 } 223 } 224 225 static void generateParagraph(int nbSentences) 226 { 227 int i; 228 for (i = 0; i < nbSentences; i++) { 229 int wordsPerSentence = about(11); 230 generateSentence(wordsPerSentence); 231 } 232 if (g_nbChars < g_maxChars) { 233 g_ptr[g_nbChars++] = '\n'; 234 } 235 if (g_nbChars < g_maxChars) { 236 g_ptr[g_nbChars++] = '\n'; 237 } 238 } 239 240 /* It's "common" for lorem ipsum generators to start with the same first 241 * pre-defined sentence */ 242 static void generateFirstSentence(void) 243 { 244 int i; 245 for (i = 0; i < 18; i++) { 246 const char* word = kWords[i]; 247 const char* separator = " "; 248 if (i == 4) 249 separator = ", "; 250 if (i == 7) 251 separator = ", "; 252 generateWord(word, separator, i == 0); 253 } 254 generateWord(kWords[18], ". ", 0); 255 } 256 257 size_t 258 LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill) 259 { 260 g_ptr = (char*)buffer; 261 assert(size < INT_MAX); 262 g_maxChars = size; 263 g_nbChars = 0; 264 g_randRoot = seed; 265 if (g_distribCount == 0) { 266 init_word_distrib(kWords, kNbWords, kWeights, kNbWeights); 267 } 268 269 if (first) { 270 generateFirstSentence(); 271 } 272 while (g_nbChars < g_maxChars) { 273 int sentencePerParagraph = about(7); 274 generateParagraph(sentencePerParagraph); 275 if (!fill) 276 break; /* only generate one paragraph in not-fill mode */ 277 } 278 g_ptr = NULL; 279 return g_nbChars; 280 } 281 282 void LOREM_genBuffer(void* buffer, size_t size, unsigned seed) 283 { 284 LOREM_genBlock(buffer, size, seed, 1, 1); 285 } 286