1. 1 : import Load from './load';
  2. 2 : import Util from './util.js';
  3. 3 : import Categories from './categories.js';
  4. 4 :
  5. 5 :
  /**
   * Private helper that decides whether a request runs in documents mode
   * rather than corpus mode.
   *
   * Documents mode applies when the config has a `docIndex` key, a `docId` key,
   * or a `mode` equal to "documents". The key checks use `in`, so a key that is
   * present with an undefined value still selects documents mode.
   *
   * A null or primitive config (for example a plain string) is treated as corpus
   * mode instead of throwing, because the `in` operator raises a TypeError when
   * its right-hand side is not an object.
   *
   * @param {Object} [config={}] the parameters supplied by the caller
   * @returns {boolean} true for documents mode, false for corpus mode
   */
  function isDocumentsMode(config={}) {
    // Only objects (and functions) can be probed with `in`.
    const canHoldKeys = config !== null && (typeof config === 'object' || typeof config === 'function');
    if (!canHoldKeys) {
      return false;
    }
    return 'docIndex' in config || 'docId' in config || config.mode === 'documents';
  }
  11. 11 :
  12. 12 : /**
  13. 13 : * The Corpus class in Spyral. To get started you first need to load a Corpus, using either a
  14. 14 : * pre-existing Corpus id, or some input data. For this you can use the [loadCorpus]{@link window.loadCorpus}
  15. 15 : * method, which is an alias of {@link Spyral.Corpus.load}.
  16. 16 : *
  17. 17 : * Here's a simple example:
  18. 18 : *
  19. 19 : * loadCorpus("Hello World!").summary();
  20. 20 : *
  21. 21 : * This loads a corpus and returns an asynchronous `Promise`, but all of the methods
  22. 22 : * of Corpus are appended to the Promise, so {@link Spyral.Corpus#summary} will be called
  23. 23 : * once the Corpus promise is fulfilled. It's equivalent to the following:
  24. 24 : *
  25. 25 : * loadCorpus("Hello World!").then(corpus => corpus.summary());
  26. 26 : *
  27. 27 : * Have a look at the {@link Spyral.Corpus~CorpusConfig} configuration for more examples.
  28. 28 : *
  29. 29 : * @memberof Spyral
  30. 30 : * @class
  31. 31 : */
  32. 32 : class Corpus {
  33. 33 :
  34. 34 : /**
  35. 35 : * The Corpus config
  36. 36 : * @typedef {Object|String} Spyral.Corpus~CorpusConfig
  37. 37 : *
  38. 38 : * @property {String} corpus The ID of a previously created corpus.
  39. 39 : *
  40. 40 : * A corpus ID can be used to try to retrieve a corpus that has been previously created.
  41. 41 : * Typically the corpus ID is used as a first string argument, with an optional second
  42. 42 : * argument for other parameters (especially those to recreate the corpus if needed).
  43. 43 : *
  44. 44 : * loadCorpus("goldbug");
  45. 45 : *
  46. 46 : * loadCorpus("goldbug", {
  47. 47 : * // if corpus ID "goldbug" isn't found, use the input
  48. 48 : * input: "https://gist.githubusercontent.com/sgsinclair/84c9da05e9e142af30779cc91440e8c1/raw/goldbug.txt",
  49. 49 : * inputRemoveUntil: 'THE GOLD-BUG',
  50. 50 : * inputRemoveFrom: 'FOUR BEASTS IN ONE'
  51. 51 : * });
  52. 52 : *
  53. 53 : * @property {(String|String[])} input Input sources for the corpus.
  54. 54 : *
  55. 55 : * The input sources can be either normal text or URLs (starting with `http`).
  56. 56 : *
  57. 57 : * Typically input sources are specified as a string or an array in the first argument, with an optional second argument for other parameters.
  58. 58 : *
  59. 59 : * loadCorpus("Hello Voyant!"); // one document with this string
  60. 60 : *
  61. 61 : * loadCorpus(["Hello Voyant!", "How are you?"]); // two documents with these strings
  62. 62 : *
  63. 63 : * loadCorpus("http://hermeneuti.ca/"); // one document from URL
  64. 64 : *
  65. 65 : * loadCorpus(["http://hermeneuti.ca/", "https://en.wikipedia.org/wiki/Voyant_Tools"]); // two documents from URLs
  66. 66 : *
  67. 67 : * loadCorpus(["Hello Voyant!", "http://hermeneuti.ca/"]); // two documents, one from string and one from URL
  68. 68 : *
  69. 69 : * loadCorpus("https://gist.githubusercontent.com/sgsinclair/84c9da05e9e142af30779cc91440e8c1/raw/goldbug.txt", {
  70. 70 : * inputRemoveUntil: 'THE GOLD-BUG',
  71. 71 : * inputRemoveFrom: 'FOUR BEASTS IN ONE'
  72. 72 : * });
  73. 73 : *
  74. 74 : * // use a corpus ID but also specify an input source if the corpus can't be found
  75. 75 : * loadCorpus("goldbug", {
  76. 76 : * input: "https://gist.githubusercontent.com/sgsinclair/84c9da05e9e142af30779cc91440e8c1/raw/goldbug.txt",
  77. 77 : * inputRemoveUntil: 'THE GOLD-BUG',
  78. 78 : * inputRemoveFrom: 'FOUR BEASTS IN ONE'
  79. 79 : * });
  80. 80 : *
  81. 81 : * @property {String} inputFormat The input format of the corpus (the default is to auto-detect).
  82. 82 : *
  83. 83 : * The auto-detect format is usually reliable and inputFormat should only be used if the default
  84. 84 : * behaviour isn't desired. Most of the relevant values are used for XML documents:
  85. 85 : *
  86. 86 : * - **DTOC**: Dynamic Table of Contexts XML format
  87. 87 : * - **HTML**: Hypertext Markup Language
  88. 88 : * - **RSS**: Really Simple Syndication XML format
  89. 89 : * - **TEI**: Text Encoding Initiative XML format
  90. 90 : * - **TEICORPUS**: Text Encoding Initiative Corpus XML format
  91. 91 : * - **TEXT**: plain text
  92. 92 : * - **XML**: treat the document as XML (sometimes overriding auto-detect of XML vocabularies like RSS and TEI)
  93. 93 : *
  94. 94 : * Other formats include **PDF**, **MSWORD**, **XLSX**, **RTF**, **ODT**, and **ZIP** (but again, these rarely need to be specified).
  95. 95 : *
  96. 96 : * @property {String} tableDocuments Determine what is a document in a table (the entire table, by row, by column); only used for table-based documents.
  97. 97 : *
  98. 98 : * Possible values are:
  99. 99 : *
  100. 100 : * - **undefined or blank** (default): the entire table is one document
  101. 101 : * - **rows**: each row of the table is a separate document
  102. 102 : * - **columns**: each column of the table is a separate document
  103. 103 : *
  104. 104 : * See also [Creating a Corpus with Tables](tutorial-corpuscreator.html#tables).
  105. 105 : *
  106. 106 : * @property {String} tableContent Determine how to extract body content from the table; only used for table-based documents.
  107. 107 : *
  108. 108 : * Columns are referred to by numbers, the first is column 1 (not 0).
  109. 109 : * You can specify separate columns by using a comma or you can combine the contents of columns/cells by using a plus sign.
  110. 110 : *
  111. 111 : * Some examples:
  112. 112 : *
  113. 113 : * - **1**: use column 1
  114. 114 : * - **1,2**: use columns 1 and 2 separately
  115. 115 : * - **1+2,3**: combine columns 1 and two and use column 3 separately
  116. 116 : *
  117. 117 : * See also [Creating a Corpus with Tables](tutorial-corpuscreator.html#tables).
  118. 118 : *
  119. 119 : * @property {String} tableAuthor Determine how to extract the author from each document; only used for table-based documents.
  120. 120 : *
  121. 121 : * Columns are referred to by numbers, the first is column 1 (not 0).
  122. 122 : * You can specify separate columns by using a comma or you can combine the contents of columns/cells by using a plus sign.
  123. 123 : *
  124. 124 : * Some examples:
  125. 125 : *
  126. 126 : * - **1**: use column 1
  127. 127 : * - **1,2**: use columns 1 and 2 separately
  128. 128 : * - **1+2,3**: combine columns 1 and two and use column 3 separately
  129. 129 : *
  130. 130 : * See also [Creating a Corpus with Tables](tutorial-corpuscreator.html#tables).
  131. 131 : *
  132. 132 : * @property {String} tableTitle Determine how to extract the title from each document; only used for table-based documents.
  133. 133 : *
  134. 134 : * Columns are referred to by numbers, the first is column 1 (not 0).
  135. 135 : * You can specify separate columns by using a comma or you can combine the contents of columns/cells by using a plus sign.
  136. 136 : *
  137. 137 : * Some examples:
  138. 138 : *
  139. 139 : * - **1**: use column 1
  140. 140 : * - **1,2**: use columns 1 and 2 separately
  141. 141 : * - **1+2,3**: combine columns 1 and two and use column 3 separately
  142. 142 : *
  143. 143 : * See also [Creating a Corpus with Tables](tutorial-corpuscreator.html#tables).
  144. 144 : *
  145. 145 : * @property {String} tableGroupBy Specify a column (or columns) by which to group documents; only used for table-based documents, in rows mode.
  146. 146 : *
  147. 147 : * Columns are referred to by numbers, the first is column 1 (not 0).
  148. 148 : * You can specify separate columns by using a comma or you can combine the contents of columns/cells by using a plus sign.
  149. 149 : *
  150. 150 : * Some examples:
  151. 151 : *
  152. 152 : * - **1**: use column 1
  153. 153 : * - **1,2**: use columns 1 and 2 separately
  154. 154 : * - **1+2,3**: combine columns 1 and two and use column 3 separately
  155. 155 : *
  156. 156 : * See also [Creating a Corpus with Tables](tutorial-corpuscreator.html#tables).
  157. 157 : *
  158. 158 : * @property {String} tableNoHeadersRow Determine if the table has a first row of headers; only used for table-based documents.
  159. 159 : *
  160. 160 : * Provide a value of "true" if there is no header row, otherwise leave it blank or undefined (default).
  161. 161 : *
  162. 162 : * See also [Creating a Corpus with Tables](tutorial-corpuscreator.html#tables).
  163. 163 : *
  164. 164 : * @property {String} tokenization The tokenization strategy to use
  165. 165 : *
  166. 166 : * This should usually be undefined, unless specific behaviour is required. These are the valid values:
  167. 167 : *
  168. 168 : * - **undefined or blank**: use the default tokenization (which uses Unicode rules for word segmentation)
  169. 169 : * - **wordBoundaries**: use any Unicode character word boundaries for tokenization
  170. 170 : * - **whitespace**: tokenize by whitespace only (punctuation and other characters will be kept with words)
  171. 171 : *
  172. 172 : * See also [Creating a Corpus Tokenization](tutorial-corpuscreator.html#processing).
  173. 173 : *
  174. 174 : * @property {String} xmlContentXpath The XPath expression that defines the location of document content (the body); only used for XML-based documents.
  175. 175 : *
  176. 176 : * loadCorpus("<doc><head>Hello world!</head><body>This is Voyant!</body></doc>", {
  177. 177 : * xmlContentXpath: "//body"
  178. 178 : * }); // document would be: "This is Voyant!"
  179. 179 : *
  180. 180 : * See also [Creating a Corpus with XML](tutorial-corpuscreator.html#xml).
  181. 181 : *
  182. 182 : * @property {String} xmlTitleXpath The XPath expression that defines the location of each document's title; only used for XML-based documents.
  183. 183 : *
  184. 184 : * loadCorpus("<doc><title>Hello world!</title><body>This is Voyant!</body></doc>", {
  185. 185 : * xmlTitleXpath: "//title"
  186. 186 : * }); // title would be: "Hello world!"
  187. 187 : *
  188. 188 : * See also [Creating a Corpus with XML](tutorial-corpuscreator.html#xml).
  189. 189 : *
  190. 190 : * @property {String} xmlAuthorXpath The XPath expression that defines the location of each document's author; only used for XML-based documents.
  191. 191 : *
  192. 192 : * loadCorpus("<doc><author>Stéfan Sinclair</author><body>This is Voyant!</body></doc>", {
  193. 193 : * xmlAuthorXpath: "//author"
  194. 194 : * }); // author would be: "Stéfan Sinclair"
  195. 195 : *
  196. 196 : * See also [Creating a Corpus with XML](tutorial-corpuscreator.html#xml).
  197. 197 : *
  198. 198 : * @property {String} xmlPubPlaceXpath The XPath expression that defines the location of each document's publication place; only used for XML-based documents.
  199. 199 : *
  200. 200 : * loadCorpus("<doc><pubPlace>Montreal</pubPlace><body>This is Voyant!</body></doc>", {
  201. 201 : * xmlPubPlaceXpath: "//pubPlace"
  202. 202 : * }); // publication place would be: "Montreal"
  203. 203 : *
  204. 204 : * See also [Creating a Corpus with XML](tutorial-corpuscreator.html#xml).
  205. 205 : *
  206. 206 : * @property {String} xmlPublisherXpath The XPath expression that defines the location of each document's publisher; only used for XML-based documents.
  207. 207 : *
  208. 208 : * loadCorpus("<doc><publisher>The Owl</publisher><body>This is Voyant!</body></doc>", {
  209. 209 : * xmlPublisherXpath: "//publisher"
  210. 210 : * }); // publisher would be: "The Owl"
  211. 211 : *
  212. 212 : * See also [Creating a Corpus with XML](tutorial-corpuscreator.html#xml).
  213. 213 : *
  214. 214 : * @property {String} xmlKeywordXpath The XPath expression that defines the location of each document's keywords; only used for XML-based documents.
  215. 215 : *
  216. 216 : * loadCorpus("<doc><keyword>text analysis</keyword><body>This is Voyant!</body></doc>", {
  217. 217 : * xmlKeywordXpath: "//keyword"
  218. 218 : * }); // keyword would be: "text analysis"
  219. 219 : *
  220. 220 : * See also [Creating a Corpus with XML](tutorial-corpuscreator.html#xml).
  221. 221 : *
  222. 222 : * @property {String} xmlCollectionXpath The XPath expression that defines the location of each document's collection name; only used for XML-based documents.
  223. 223 : *
  224. 224 : * loadCorpus("<doc><collection>documentation</collection><body>This is Voyant!</body></doc>", {
  225. 225 : * xmlCollectionXpath: "//collection"
  226. 226 : * }); // collection would be: "documentation"
  227. 227 : *
  228. 228 : * See also [Creating a Corpus with XML](tutorial-corpuscreator.html#xml).
  229. 229 : *
  230. 230 : * @property {String} xmlDocumentsXpath The XPath expression that defines the location of each document; only used for XML-based documents.
  231. 231 : *
  232. 232 : * See also [Creating a Corpus with XML](tutorial-corpuscreator.html#xml).
  233. 233 : *
  234. 234 : * @property {String} xmlGroupByXpath The XPath expression by which to group multiple documents; only used for XML-based documents.
  235. 235 : *
  236. 236 : * loadCorpus("<doc><sp s='Juliet'>Hello!</sp><sp s='Romeo'>Hi!</sp><sp s='Juliet'>Bye!</sp></doc>", {
  237. 237 : * xmlDocumentsXpath: '//sp',
  238. 238 : * xmlGroupByXpath: "//@s"
  239. 239 : * }); // two docs: "Hello! Bye!" (Juliet) and "Hi!" (Romeo)
  240. 240 : *
  241. 241 : * See also [Creating a Corpus with XML](tutorial-corpuscreator.html#xml).
  242. 242 : *
  243. 243 : * @property {String} xmlExtraMetadataXpath A value that defines the location of other metadata; only used for XML-based documents.
  244. 244 : *
  245. 245 : * loadCorpus("<doc><tool>Voyant</tool><phase>1</phase><body>This is Voyant!</body></doc>", {
  246. 246 : * xmlExtraMetadataXpath: "tool=//tool\nphase=//phase"
  247. 247 : * }); // tool would be "Voyant" and phase would be "1"
  248. 248 : *
  249. 249 : * Note that `xmlExtraMetadataXpath` is a bit different from the other XPath expressions in that it's
  250. 250 : * possible to define multiple values (each on its own line) in the form of name=xpath.
  251. 251 : *
  252. 252 : * See also [Creating a Corpus with XML](tutorial-corpuscreator.html#xml).
  253. 253 : *
  254. 254 : * @property {String} xmlExtractorTemplate Pass the XML document through the XSL template located at the specified URL before extraction (only used for XML-based documents).
  255. 255 : *
  256. 256 : * This is an advanced parameter that allows you to define a URL of an XSL template that can
  257. 257 : * be called *before* text extraction (in other words, the other XML-based parameters apply
  258. 258 : * after this template has been processed).
  259. 259 : *
  260. 260 : * @property {String} inputRemoveUntil Omit text up until the start of the matching regular expression (this is ignored in XML-based documents).
  261. 261 : *
  262. 262 : * loadCorpus("Hello world! This is Voyant!", {
  263. 263 : * inputRemoveUntil: "This"
  264. 264 : * }); // document would be: "This is Voyant!"
  265. 265 : *
  266. 266 : * See also [Creating a Corpus with Text](tutorial-corpuscreator.html#text).
  267. 267 : *
  268. 268 : * @property {String} inputRemoveUntilAfter Omit text up until the end of the matching regular expression (this is ignored in XML-based documents).
  269. 269 : *
  270. 270 : * loadCorpus("Hello world! This is Voyant!", {
  271. 271 : * inputRemoveUntilAfter: "world!"
  272. 272 : * }); // document would be: "This is Voyant!"
  273. 273 : *
  274. 274 : * See also [Creating a Corpus with Text](tutorial-corpuscreator.html#text).
  275. 275 : *
  276. 276 : * @property {String} inputRemoveFrom Omit text from the start of the matching regular expression (this is ignored in XML-based documents).
  277. 277 : *
  278. 278 : * loadCorpus("Hello world! This is Voyant!", {
  279. 279 : * inputRemoveFrom: "This"
  280. 280 : * }); // document would be: "Hello world!"
  281. 281 : *
  282. 282 : * See also [Creating a Corpus with Text](tutorial-corpuscreator.html#text).
  283. 283 : *
  284. 284 : * @property {String} inputRemoveFromAfter Omit text from the end of the matching regular expression (this is ignored in XML-based documents).
  285. 285 : *
  286. 286 : * loadCorpus("Hello world! This is Voyant!", {
  287. 287 : * inputRemoveFromAfter: "This"
  288. 288 : * }); // document would be: "Hello world! This"
  289. 289 : *
  290. 290 : * See also [Creating a Corpus with Text](tutorial-corpuscreator.html#text).
  291. 291 : *
  292. 292 : * @property {String} subTitle A sub-title for the corpus.
  293. 293 : *
  294. 294 : * This is currently not used, except in the Dynamic Table of Contexts skin. Still, it may be worth specifying a subtitle for later use.
  295. 295 : *
  296. 296 : * @property {String} title A title for the corpus.
  297. 297 : *
  298. 298 : * This is currently not used, except in the Dynamic Table of Contexts skin. Still, it may be worth specifying a title for later use.
  299. 299 : *
  300. 300 : * @property {String} curatorTsv a simple TSV of paths and labels for the DToC interface (this isn't typically used outside of the specialized DToC context).
  301. 301 : *
  302. 302 : * The DToC skin allows curation of XML tags and attributes in order to constrain the entries shown in the interface or to provide friendlier labels. This assumes plain text unicode input with one definition per line where the simple XPath expression is separated by a tab from a label.
  303. 303 : *
  304. 304 : * p paragraph
  305. 305 : * ref[@target*="religion"] religion
  306. 306 : *
  307. 307 : * For more information see the DToC documentation on [Curating Tags]{@link http://cwrc.ca/Documentation/public/index.html#DITA_Files-Various_Applications/DToC/CuratingTags.html}
  308. 308 : */
  309. 309 :
  /**
   * Create a new Corpus that refers to the specified Corpus ID.
   *
   * The ID is stored unchanged on the `corpusid` property of the instance;
   * nothing else happens here.
   * @constructor
   * @param {string} id The Corpus ID
   */
  constructor(id) {
    this.corpusid = id;
  }
  318. 318 :
  // Expose the imported Load module as a static member of Corpus.
  static Load = Load;
  320. 320 :
  /**
   * Set the base URL by forwarding the value to `Load.setBaseUrl`.
   *
   * @param {string} baseUrl the value handed to Load (presumably a URL string — confirm against Load)
   */
  static setBaseUrl(baseUrl) {
    Load.setBaseUrl(baseUrl);
  }
  324. 324 :
  325. 325 : /**
  326. 326 : * Returns the ID of the corpus.
  327. 327 : *
  328. 328 : * @returns {Promise<string>} a Promise for the string ID of the corpus
  329. 329 : */
  330. 330 : id() {
  331. 331 : let me = this;
  332. 332 : return new Promise(resolve => resolve(me.corpusid));
  333. 333 : }
  334. 334 :
  335. 335 : /*
  336. 336 : * Create a Corpus and return the ID
  337. 337 : * @param {Object} config
  338. 338 : * @param {Object} api
  339. 339 : */
  340. 340 : // static id(config, api) {
  341. 341 : // return Corpus.load(config).then(corpus => corpus.id(api || config));
  342. 342 : // }
  343. 343 :
  344. 344 : /**
  345. 345 : * Returns the metadata object (of the corpus or document, depending on which mode is used).
  346. 346 : *
  347. 347 : * The following is an example of the object return for the metadata of the Jane Austen corpus:
  348. 348 : *
  349. 349 : * {
  350. 350 : * "id": "b50407fd1cbbecec4315a8fc411bad3c",
  351. 351 : * "alias": "austen",
  352. 352 : * "title": "",
  353. 353 : * "subTitle": "",
  354. 354 : * "documentsCount": 8,
  355. 355 : * "createdTime": 1582429585984,
  356. 356 : * "createdDate": "2020-02-22T22:46:25.984-0500",
  357. 357 : * "lexicalTokensCount": 781763,
  358. 358 : * "lexicalTypesCount": 15368,
  359. 359 : * "noPasswordAccess": "NORMAL",
  360. 360 : * "languageCodes": [
  361. 361 : * "en"
  362. 362 : * ]
  363. 363 : * }
  364. 364 : *
  365. 365 : * The following is an example of what is returned as metadata for the first document:
  366. 366 : *
  367. 367 : * [
  368. 368 : * {
  369. 369 : * "id": "ddac6b12c3f4261013c63d04e8d21b45",
  370. 370 : * "extra.X-Parsed-By": "org.apache.tika.parser.DefaultParser",
  371. 371 : * "tokensCount-lexical": "33559",
  372. 372 : * "lastTokenStartOffset-lexical": "259750",
  373. 373 : * "parent_modified": "1548457455000",
  374. 374 : * "typesCount-lexical": "4235",
  375. 375 : * "typesCountMean-lexical": "7.924203",
  376. 376 : * "lastTokenPositionIndex-lexical": "33558",
  377. 377 : * "index": "0",
  378. 378 : * "language": "en",
  379. 379 : * "sentencesCount": "1302",
  380. 380 : * "source": "stream",
  381. 381 : * "typesCountStdDev-lexical": "46.626404",
  382. 382 : * "title": "1790 Love And Freindship",
  383. 383 : * "parent_queryParameters": "VOYANT_BUILD=M16&textarea-1015-inputEl=Type+in+one+or+more+URLs+on+separate+lines+or+paste+in+a+full+text.&VOYANT_REMOTE_ID=199.229.249.196&accessIP=199.229.249.196&VOYANT_VERSION=2.4&palette=default&suppressTools=false",
  384. 384 : * "extra.Content-Type": "text/plain; charset=windows-1252",
  385. 385 : * "parentType": "expansion",
  386. 386 : * "extra.Content-Encoding": "windows-1252",
  387. 387 : * "parent_source": "file",
  388. 388 : * "parent_id": "ae47e3a72cd3cad51e196e8a41e21aec",
  389. 389 : * "modified": "1432861756000",
  390. 390 : * "location": "1790 Love And Freindship.txt",
  391. 391 : * "parent_title": "Austen",
  392. 392 : * "parent_location": "Austen.zip"
  393. 393 : * }
  394. 394 : * ]
  395. 395 : *
  396. 396 : * In Corpus mode there's no reason to specify arguments. In documents mode you
  397. 397 : * can request specific documents in the config object:
  398. 398 : *
  399. 399 : * * **start**: the zero-based start of the list
  400. 400 : * * **limit**: a limit to the number of items to return at a time
  401. 401 : * * **docIndex**: a zero-based list of documents (first document is zero, etc.); multiple documents can be separated by a comma
  402. 402 : * * **docId**: a set of document IDs; multiple documents can be separated by a comma
  403. 403 : * * **query**: one or more term queries for the title, author or full-text
  404. 404 : * * **sort**: one of the following sort orders: `INDEX`, `TITLE`, `AUTHOR`, `TOKENSCOUNTLEXICAL`, `TYPESCOUNTLEXICAL`, `TYPETOKENRATIOLEXICAL`, `PUBDATE`
  405. 405 : * * **dir**: sort direction, **`ASC`**ending or **`DESC`**ending
  406. 406 : *
  407. 407 : * An example:
  408. 408 : *
  409. 409 : * // this would show the number 8 (the size of the corpus)
  410. 410 : * loadCorpus("austen").metadata().then(metadata => metadata.documentsCount)
  411. 411 : *
  412. 412 : * @param {Object} config an Object specifying parameters (see list above)
  413. 413 : * @returns {Promise<object>} a Promise for an Object containing metadata
  414. 414 : */
  415. 415 : metadata(config) {
  416. 416 : return Load.trombone(config, {
  417. 417 : tool: isDocumentsMode(config) ? 'corpus.DocumentsMetadata' : 'corpus.CorpusMetadata',
  418. 418 : corpus: this.corpusid
  419. 419 : })
  420. 420 : .then(data => isDocumentsMode(config) ? data.documentsMetadata.documents : data.corpus.metadata);
  421. 421 : }
  422. 422 :
  423. 423 : /*
  424. 424 : * Create a Corpus and return the metadata
  425. 425 : * @param {*} config
  426. 426 : * @param {*} api
  427. 427 : */
  428. 428 : // static metadata(config, api) {
  429. 429 : // return Corpus.load(config).then(corpus => corpus.metadata(api || config));
  430. 430 : // }
  431. 431 :
  432. 432 : /**
  433. 433 : * Returns a brief summary of the corpus that includes essential metadata (documents count, terms count, etc.)
  434. 434 : *
  435. 435 : * An example:
  436. 436 : *
  437. 437 : * loadCorpus("austen").summary();
  438. 438 : *
  439. 439 : * @returns {Promise<string>} a Promise for a string containing a brief summary of the corpus metadata
  440. 440 : */
  441. 441 : summary() {
  442. 442 : return this.metadata().then(data => {
  443. 443 : return `This corpus (${data.alias ? data.alias : data.id}) has ${data.documentsCount.toLocaleString()} documents with ${data.lexicalTokensCount.toLocaleString()} total words and ${data.lexicalTypesCount.toLocaleString()} unique word forms.`;
  444. 444 : });
  445. 445 : }
  446. 446 :
  447. 447 : /*
  448. 448 : * Create a Corpus and return the summary
  449. 449 : * @param {*} config
  450. 450 : * @param {*} api
  451. 451 : */
  452. 452 : // static summary(config, api) {
  453. 453 : // return Corpus.load(config).then(corpus => corpus.summary(api || config));
  454. 454 : // }
  455. 455 :
  456. 456 : /**
  457. 457 : * Returns an array of document titles for the corpus.
  458. 458 : *
  459. 459 : * The following are valid in the config parameter:
  460. 460 : *
  461. 461 : * * **start**: the zero-based start of the list
  462. 462 : * * **limit**: a limit to the number of items to return at a time
  463. 463 : * * **docIndex**: a zero-based list of documents (first document is zero, etc.); multiple documents can be separated by a comma
  464. 464 : * * **docId**: a set of document IDs; multiple documents can be separated by a comma
  465. 465 : * * **query**: one or more term queries for the title, author or full-text
  466. 466 : * * **sort**: one of the following sort orders: `INDEX`, `TITLE`, `AUTHOR`, `TOKENSCOUNTLEXICAL`, `TYPESCOUNTLEXICAL`, `TYPETOKENRATIOLEXICAL`, `PUBDATE`
  467. 467 : * * **dir**: sort direction, **`ASC`**ending or **`DESC`**ending
  468. 468 : *
  469. 469 : * An example:
  470. 470 : *
  471. 471 : * loadCorpus("austen").titles().then(titles => "The last work is: "+titles[titles.length-1])
  472. 472 : *
  473. 473 : * @param {Object} config an Object specifying parameters (see list above)
  474. 474 : * @param {number} config.start the zero-based start of the list
  475. 475 : * @param {number} config.limit a limit to the number of items to return at a time
  476. 476 : * @param {number} config.docIndex a zero-based list of documents (first document is zero, etc.); multiple documents can be separated by a comma
  477. 477 : * @param {string} config.docId a set of document IDs; multiple documents can be separated by a comma
  478. 478 : * @param {string} config.query one or more term queries for the title, author or full-text
  479. 479 : * @param {string} config.sort one of the following sort orders: `INDEX`, `TITLE`, `AUTHOR`, `TOKENSCOUNTLEXICAL`, `TYPESCOUNTLEXICAL`, `TYPETOKENRATIOLEXICAL`, `PUBDATE`
  480. 480 : * @param {string} config.dir sort direction, **`ASC`**ending or **`DESC`**ending
  481. 481 : * @returns {Promise<Array>} a Promise for an Array of document titles
  482. 482 : */
  483. 483 : titles(config={}) {
  484. 484 : config.mode = 'documents';
  485. 485 : return this.metadata(config).then(data => data.map(doc => doc.title));
  486. 486 : }
  487. 487 :
  488. 488 : /**
  489. 489 : * Returns an array of documents metadata for the corpus.
  490. 490 : *
  491. 491 : * The following are valid in the config parameter:
  492. 492 : *
  493. 493 : * * **start**: the zero-based start of the list
  494. 494 : * * **limit**: a limit to the number of items to return at a time
  495. 495 : * * **docIndex**: a zero-based list of documents (first document is zero, etc.); multiple documents can be separated by a comma
  496. 496 : * * **docId**: a set of document IDs; multiple documents can be separated by a comma
  497. 497 : * * **query**: one or more term queries for the title, author or full-text
  498. 498 : * * **sort**: one of the following sort orders: `INDEX`, `TITLE`, `AUTHOR`, `TOKENSCOUNTLEXICAL`, `TYPESCOUNTLEXICAL`, `TYPETOKENRATIOLEXICAL`, `PUBDATE`
  499. 499 : * * **dir**: sort direction, **`ASC`**ending or **`DESC`**ending
  500. 500 : *
  501. 501 : * @param {Object} config an Object specifying parameters (see list above)
  502. 502 : * @param {number} config.start the zero-based start of the list
  503. 503 : * @param {number} config.limit a limit to the number of items to return at a time
  504. 504 : * @param {number} config.docIndex a zero-based list of documents (first document is zero, etc.); multiple documents can be separated by a comma
  505. 505 : * @param {string} config.docId a set of document IDs; multiple documents can be separated by a comma
  506. 506 : * @param {string} config.query one or more term queries for the title, author or full-text
  507. 507 : * @param {string} config.sort one of the following sort orders: `INDEX`, `TITLE`, `AUTHOR`, `TOKENSCOUNTLEXICAL`, `TYPESCOUNTLEXICAL`, `TYPETOKENRATIOLEXICAL`, `PUBDATE`
  508. 508 : * @param {string} config.dir sort direction, **`ASC`**ending or **`DESC`**ending
  509. 509 : * @returns {Promise<Array>} a Promise for an Array of documents metadata
  510. 510 : */
  511. 511 : documents(config={}) {
  512. 512 : config.mode = 'documents';
  513. 513 : return this.metadata(config);
  514. 514 : }
  515. 515 :
  516. 516 : /*
  517. 517 : * Create a Corpus and return the titles
  518. 518 : * @param {*} config
  519. 519 : * @param {*} api
  520. 520 : */
  521. 521 : // static titles(config, api) {
  522. 522 : // return Corpus.load(config).then(corpus => corpus.titles(api || config));
  523. 523 : // }
  524. 524 :
  525. 525 : /**
  526. 526 : * Returns the text of the entire corpus.
  527. 527 : *
  528. 528 : * Texts are concatenated together with two new lines and three dashes (\\n\\n---\\n\\n)
  529. 529 : *
  530. 530 : * The following are valid in the config parameter:
  531. 531 : *
  532. 532 : * * **noMarkup**: strips away the markup
  533. 533 : * * **compactSpace**: strips away superfluous spaces and multiple new lines
  534. 534 : * * **limit**: a limit to the number of characters (per text)
  535. 535 : * * **format**: `text` for plain text, any other value for the simplified Voyant markup
  536. 536 : *
  537. 537 : * An example:
  538. 538 : *
  539. 539 : * // fetch 1000 characters from each text in the corpus into a single string
  540. 540 : * loadCorpus("austen").text({limit:1000})
  541. 541 : *
  542. 542 : * @param {Object} config an Object specifying parameters (see list above)
  543. 543 : * @param {boolean} config.noMarkup strips away the markup
  544. 544 : * @param {boolean} config.compactSpace strips away superfluous spaces and multiple new lines
  545. 545 : * @param {number} config.limit a limit to the number of characters (per text)
  546. 546 : * @param {string} config.format `text` for plain text, any other value for the simplified Voyant markup
  547. 547 : * @returns {Promise<string>} a Promise for a string of the corpus
  548. 548 : */
  549. 549 : text(config) {
  550. 550 : return this.texts(config).then(data => data.join('\n\n---\n\n'));
  551. 551 : }
  552. 552 :
  553. 553 : /*
  554. 554 : * Create a Corpus and return the text
  555. 555 : * @param {*} config
  556. 556 : * @param {*} api
  557. 557 : */
  558. 558 : // static text(config, api) {
  559. 559 : // return Corpus.load(config).then(corpus => corpus.text(api || config));
  560. 560 : // }
  561. 561 :
  562. 562 : /**
  563. 563 : * Returns an array of texts from the entire corpus.
  564. 564 : *
  565. 565 : * The following are valid in the config parameter:
  566. 566 : *
  567. 567 : * * **noMarkup**: strips away the markup
  568. 568 : * * **compactSpace**: strips away superfluous spaces and multiple new lines
  569. 569 : * * **limit**: a limit to the number of characters (per text)
  570. 570 : * * **format**: `text` for plain text, any other value for the simplified Voyant markup
  571. 571 : *
  572. 572 : * An example:
  573. 573 : *
  574. 574 : * // fetch 1000 characters from each text in the corpus into an Array
  575. 575 : * loadCorpus("austen").texts({limit:1000})
  576. 576 : *
  577. 577 : * @param {Object} config an Object specifying parameters (see list above)
  578. 578 : * @param {boolean} config.noMarkup strips away the markup
  579. 579 : * @param {boolean} config.compactSpace strips away superfluous spaces and multiple new lines
  580. 580 : * @param {number} config.limit a limit to the number of characters (per text)
  581. 581 : * @param {string} config.format `text` for plain text, any other value for the simplified Voyant markup
  582. 582 : * @returns {Promise<Array>} a Promise for an Array of texts from the corpus
  583. 583 : */
  584. 584 : texts(config) {
  585. 585 : return Load.trombone(config, {
  586. 586 : tool: 'corpus.CorpusTexts',
  587. 587 : corpus: this.corpusid
  588. 588 : }).then(data => data.texts.texts);
  589. 589 : }
  590. 590 :
  591. 591 : /*
  592. 592 : * Create a Corpus and return the texts
  593. 593 : * @param {*} config
  594. 594 : * @param {*} api
  595. 595 : */
  596. 596 : // static texts(config, api) {
  597. 597 : // return Corpus.load(config).then(corpus => corpus.texts(api || config));
  598. 598 : // }
  599. 599 :
  600. 600 : /**
  601. 601 : * Returns an array of terms (either CorpusTerms or DocumentTerms, depending on the specified mode).
  602. 602 : * These terms are actually types, so information about each type is collected (as opposed to the [tokens]{@link Spyral.Corpus#tokens}
  603. 603 : * method which is for every occurrence in document order).
  604. 604 : *
  605. 605 : * The mode is set to "documents" when any of the following is true
  606. 606 : *
  607. 607 : * * the `mode` parameter is set to "documents"
  608. 608 : * * a `docIndex` parameter being set
  609. 609 : * * a `docId` parameter being set
  610. 610 : *
 * The following is an example of a Corpus Term (corpus mode):
  612. 612 : *
  613. 613 : * {
  614. 614 : * "term": "the",
  615. 615 : * "inDocumentsCount": 8,
  616. 616 : * "rawFreq": 28292,
  617. 617 : * "relativeFreq": 0.036189996,
  618. 618 : * "comparisonRelativeFreqDifference": 0
  619. 619 : * }
  620. 620 : *
  621. 621 : * The following is an example of Document Term (documents mode):
  622. 622 : *
  623. 623 : * {
  624. 624 : * "term": "the",
  625. 625 : * "rawFreq": 1333,
  626. 626 : * "relativeFreq": 39721.086,
  627. 627 : * "zscore": 28.419,
  628. 628 : * "zscoreRatio": -373.4891,
  629. 629 : * "tfidf": 0.0,
  630. 630 : * "totalTermsCount": 33559,
  631. 631 : * "docIndex": 0,
  632. 632 : * "docId": "8a61d5d851a69c03c6ba9cc446713574"
  633. 633 : * }
  634. 634 : *
  635. 635 : * The following config parameters are valid in both modes:
  636. 636 : *
  637. 637 : * * **start**: the zero-based start index of the list (for paging)
  638. 638 : * * **limit**: the maximum number of terms to provide per request
  639. 639 : * * **minRawFreq**: the minimum raw frequency of terms
  640. 640 : * * **query**: a term query (see [search tutorial]{@tutorial search})
  641. 641 : * * **stopList**: a list of stopwords to include (see [stopwords tutorial]{@tutorial stopwords})
  642. 642 : * * **withDistributions**: a true value shows distribution across the corpus (corpus mode) or across the document (documents mode)
  643. 643 : * * **whiteList**: a keyword list – terms will be limited to this list
  644. 644 : * * **tokenType**: the token type to use, by default `lexical` (other possible values might be `title` and `author`)
  645. 645 : * * **dir**: sort direction, **`ASC`**ending or **`DESC`**ending
  646. 646 : *
  647. 647 : * The following are specific to corpus mode:
  648. 648 : *
  649. 649 : * * **bins**: by default there are the same number of bins as there are documents (for distribution values), this can be modified
  650. 650 : * * **corpusComparison**: you can provide the ID of a corpus for comparison of frequency values
  651. 651 : * * **inDocumentsCountOnly**: if you don't need term frequencies but only frequency per document set this to true
  652. 652 : * * **sort**: the order of the terms, one of the following: `INDOCUMENTSCOUNT, RAWFREQ, TERM, RELATIVEPEAKEDNESS, RELATIVESKEWNESS, COMPARISONRELATIVEFREQDIFFERENCE`
  653. 653 : *
  654. 654 : * The following are specific to documents mode:
  655. 655 : *
  656. 656 : * * **bins**: by default the document is divided into 10 equal bins(for distribution values), this can be modified
  657. 657 : * * **sort**: the order of the terms, one of the following: `RAWFREQ, RELATIVEFREQ, TERM, TFIDF, ZSCORE`
  658. 658 : * * **perDocLimit**: the `limit` parameter is for the total number of terms returned, this parameter allows you to specify a limit value per document
  659. 659 : * * **docIndex**: the zero-based index of the documents to include (use commas to separate multiple values)
  660. 660 : * * **docId**: the document IDs to include (use commas to separate multiple values)
  661. 661 : *
  662. 662 : * An example:
  663. 663 : *
  664. 664 : * // show top 5 terms
  665. 665 : * loadCorpus("austen").terms({stopList: 'auto', limit: 5}).then(terms => terms.map(term => term.term))
  666. 666 : *
  667. 667 : * // show top term for each document
  668. 668 : * loadCorpus("austen").terms({stopList: 'auto', perDocLimit: 1, mode: 'documents'}).then(terms => terms.map(term => term.term))
  669. 669 : *
  670. 670 : * @param {Object} config an Object specifying parameters (see list above)
  671. 671 : * @param {number} config.start the zero-based start index of the list (for paging)
  672. 672 : * @param {number} config.limit the maximum number of terms to provide per request
  673. 673 : * @param {number} config.minRawFreq the minimum raw frequency of terms
  674. 674 : * @param {string} config.query a term query (see [search tutorial]{@tutorial search})
  675. 675 : * @param {string} config.stopList a list of stopwords to include (see [stopwords tutorial]{@tutorial stopwords})
  676. 676 : * @param {boolean} config.withDistributions a true value shows distribution across the corpus (corpus mode) or across the document (documents mode)
  677. 677 : * @param {string} config.whiteList a keyword list – terms will be limited to this list
  678. 678 : * @param {string} config.tokenType the token type to use, by default `lexical` (other possible values might be `title` and `author`)
  679. 679 : * @param {string} config.dir sort direction, **`ASC`**ending or **`DESC`**ending
 * @returns {Promise<Array>} a Promise for an Array of terms
  681. 681 : */
  682. 682 : terms(config) {
  683. 683 : return Load.trombone(config, {
  684. 684 : tool: isDocumentsMode(config) ? 'corpus.DocumentTerms' : 'corpus.CorpusTerms',
  685. 685 : corpus: this.corpusid
  686. 686 : }).then(data => isDocumentsMode(config) ? data.documentTerms.terms : data.corpusTerms.terms);
  687. 687 : }
  688. 688 :
  689. 689 : /*
  690. 690 : * Create a Corpus and return the terms
  691. 691 : * @param {*} config
  692. 692 : * @param {*} api
  693. 693 : */
  694. 694 : // static terms(config, api) {
  695. 695 : // return Corpus.load(config).then(corpus => corpus.terms(api || config));
  696. 696 : // }
  697. 697 :
  698. 698 : /**
  699. 699 : * Returns an array of document tokens.
  700. 700 : *
  701. 701 : * The promise returns an array of document token objects. A document token object can look something like this:
  702. 702 : *
  703. 703 : * {
  704. 704 : * "docId": "8a61d5d851a69c03c6ba9cc446713574",
  705. 705 : * "docIndex": 0,
  706. 706 : * "term": "LOVE",
  707. 707 : * "tokenType": "lexical",
  708. 708 : * "rawFreq": 54,
  709. 709 : * "position": 0,
  710. 710 : * "startOffset": 3,
  711. 711 : * "endOffset": 7
  712. 712 : * }
  713. 713 : *
  714. 714 : * The following are valid in the config parameter:
  715. 715 : *
  716. 716 : * * **start**: the zero-based start index of the list (for paging)
  717. 717 : * * **limit**: the maximum number of terms to provide per request
  718. 718 : * * **stopList**: a list of stopwords to include (see [stopwords tutorial]{@tutorial stopwords})
  719. 719 : * * **whiteList**: a keyword list – terms will be limited to this list
  720. 720 : * * **perDocLimit**: the `limit` parameter is for the total number of terms returned, this parameter allows you to specify a limit value per document
  721. 721 : * * **noOthers**: only include lexical forms, no other tokens
  722. 722 : * * **stripTags**: one of the following: `ALL`, `BLOCKSONLY`, `NONE` (`BLOCKSONLY` tries to maintain blocks for line formatting)
  723. 723 : * * **withPosLemmas**: include part-of-speech and lemma information when available (reliability of this may vary by instance)
  724. 724 : * * **docIndex**: the zero-based index of the documents to include (use commas to separate multiple values)
  725. 725 : * * **docId**: the document IDs to include (use commas to separate multiple values)
  726. 726 : *
  727. 727 : * An example:
  728. 728 : *
  729. 729 : * // load the first 20 tokens (don't include tags, spaces, etc.)
  730. 730 : * loadCorpus("austen").tokens({limit: 20, noOthers: true})
  731. 731 : *
  732. 732 : * @param {Object} config an Object specifying parameters (see above)
  733. 733 : * @param {number} config.start the zero-based start index of the list (for paging)
  734. 734 : * @param {number} config.limit the maximum number of terms to provide per request
  735. 735 : * @param {string} config.stopList a list of stopwords to include (see [stopwords tutorial]{@tutorial stopwords})
  736. 736 : * @param {string} config.whiteList a keyword list – terms will be limited to this list
  737. 737 : * @param {number} config.perDocLimit the `limit` parameter is for the total number of terms returned, this parameter allows you to specify a limit value per document
  738. 738 : * @param {boolean} config.noOthers only include lexical forms, no other tokens
  739. 739 : * @param {string} config.stripTags one of the following: `ALL`, `BLOCKSONLY`, `NONE` (`BLOCKSONLY` tries to maintain blocks for line formatting)
  740. 740 : * @param {boolean} config.withPosLemmas include part-of-speech and lemma information when available (reliability of this may vary by instance)
  741. 741 : * @param {number} config.docIndex the zero-based index of the documents to include (use commas to separate multiple values)
  742. 742 : * @param {string} config.docId the document IDs to include (use commas to separate multiple values)
  743. 743 : * @returns {Promise<Array>} a Promise for an Array of document tokens
  744. 744 : */
  745. 745 : tokens(config) {
  746. 746 : return Load.trombone(config, {
  747. 747 : tool: 'corpus.DocumentTokens',
  748. 748 : corpus: this.corpusid
  749. 749 : }).then(data => data.documentTokens.tokens);
  750. 750 : }
  751. 751 :
  752. 752 : /*
  753. 753 : * Create a Corpus and return the tokens
  754. 754 : * @param {*} config
  755. 755 : * @param {*} api
  756. 756 : */
  757. 757 : // static tokens(config, api) {
  758. 758 : // return Corpus.load(config).then(corpus => corpus.tokens(api || config));
  759. 759 : // }
  760. 760 :
  761. 761 : /**
  762. 762 : * Returns an array of words from the corpus.
  763. 763 : *
  764. 764 : * The array of words are in document order, much like tokens.
  765. 765 : *
  766. 766 : * The following are valid in the config parameter:
  767. 767 : *
  768. 768 : * * **start**: the zero-based start index of the list (for paging)
  769. 769 : * * **limit**: the maximum number of terms to provide per request
  770. 770 : * * **stopList**: a list of stopwords to include (see [stopwords tutorial]{@tutorial stopwords})
  771. 771 : * * **whiteList**: a keyword list – terms will be limited to this list
  772. 772 : * * **perDocLimit**: the `limit` parameter is for the total number of terms returned, this parameter allows you to specify a limit value per document
  773. 773 : * * **docIndex**: the zero-based index of the documents to include (use commas to separate multiple values)
  774. 774 : * * **docId**: the document IDs to include (use commas to separate multiple values)
  775. 775 : *
  776. 776 : * An example:
  777. 777 : *
  778. 778 : * // load the first 20 words in the corpus
 * loadCorpus("austen").words({limit: 20})
  780. 780 : *
  781. 781 : * @param {Object} config an Object specifying parameters (see above)
  782. 782 : * @param {number} config.start the zero-based start index of the list (for paging)
  783. 783 : * @param {number} config.limit the maximum number of terms to provide per request
  784. 784 : * @param {string} config.stopList a list of stopwords to include (see [stopwords tutorial]{@tutorial stopwords})
  785. 785 : * @param {string} config.whiteList a keyword list – terms will be limited to this list
  786. 786 : * @param {number} config.perDocLimit the `limit` parameter is for the total number of terms returned, this parameter allows you to specify a limit value per document
  787. 787 : * @param {number} config.docIndex the zero-based index of the documents to include (use commas to separate multiple values)
  788. 788 : * @param {string} config.docId the document IDs to include (use commas to separate multiple values)
  789. 789 : * @returns {Promise<Array>} a Promise for an Array of words
  790. 790 : */
  791. 791 : words(config = {}) {
  792. 792 : // by default DocumentTokens limits to 50 which probably isn't expected
  793. 793 : if (!('limit' in config)) {config.limit=0;}
  794. 794 : return Load.trombone(config, {
  795. 795 : tool: 'corpus.DocumentTokens',
  796. 796 : noOthers: true,
  797. 797 : corpus: this.corpusid
  798. 798 : }).then(data => data.documentTokens.tokens.map(t => t.term));
  799. 799 : }
  800. 800 :
  801. 801 : /*
  802. 802 : * Create a Corpus and return an array of lexical forms (words) in document order.
  803. 803 : * @param {Object} config
  804. 804 : * @param {Object} api
  805. 805 : */
  806. 806 : // static words(config, api) {
  807. 807 : // return Corpus.load(config).then(corpus => corpus.words(api || config));
  808. 808 : // }
  809. 809 :
  810. 810 : /**
  811. 811 : * Returns an array of Objects that contain keywords in contexts (KWICs).
  812. 812 : *
  813. 813 : * An individual KWIC Object looks something like this:
  814. 814 : *
  815. 815 : * {
  816. 816 : * "docIndex": 0,
  817. 817 : * "query": "love",
  818. 818 : * "term": "love",
  819. 819 : * "position": 0,
  820. 820 : * "left": "FREINDSHIP AND OTHER EARLY WORKS",
  821. 821 : * "middle": "Love",
  822. 822 : * "right": " And Friendship And Other Early"
  823. 823 : * }
  824. 824 : *
  825. 825 : * The following are valid in the config parameter:
  826. 826 : *
  827. 827 : * * **start**: the zero-based start index of the list (for paging)
  828. 828 : * * **limit**: the maximum number of terms to provide per request
  829. 829 : * * **query**: a term query (see [search tutorial]{@tutorial search})
  830. 830 : * * **sort**: the order of the contexts: `TERM, DOCINDEX, POSITION, LEFT, RIGHT`
  831. 831 : * * **dir**: sort direction, **`ASC`**ending or **`DESC`**ending
  832. 832 : * * **perDocLimit**: the `limit` parameter is for the total number of terms returned, this parameter allows you to specify a limit value per document
  833. 833 : * * **stripTags**: for the `left`, `middle` and `right` values, one of the following: `ALL`, `BLOCKSONLY` (tries to maintain blocks for line formatting), `NONE` (default)
  834. 834 : * * **context**: the size of the context (the number of words on each side of the keyword)
  835. 835 : * * **docIndex**: the zero-based index of the documents to include (use commas to separate multiple values)
  836. 836 : * * **docId**: the document IDs to include (use commas to separate multiple values)
  837. 837 : * * **overlapStrategy**: determines how to handle cases where there's overlap between KWICs, such as "to be or not to be" when the keyword is "be"; here are the options:
  838. 838 : * * **none**: nevermind the overlap, keep all words
  839. 839 : * * {left: "to", middle: "be", right: "or not to be"}
  840. 840 : * * {left: "to be or not to", middle: "be", right: ""}
  841. 841 : * * **first**: priority goes to the first occurrence (some may be dropped)
  842. 842 : * * {left: "to", middle: "be", right: "or not to be"}
  843. 843 : * * **merge**: balance the words between overlapping occurrences
  844. 844 : * * {left: "to", middle: "be", right: "or"}
  845. 845 : * * {left: "not to", middle: "be", right: ""}
  846. 846 : *
  847. 847 : * An example:
  848. 848 : *
 * // load the first 10 contexts for the query "love"
  850. 850 : * loadCorpus("austen").contexts({query: "love", limit: 10})
  851. 851 : *
  852. 852 : * @param {Object} config an Object specifying parameters (see above)
  853. 853 : * @param {number} config.start the zero-based start index of the list (for paging)
  854. 854 : * @param {number} config.limit the maximum number of terms to provide per request
  855. 855 : * @param {string} config.query a term query (see [search tutorial]{@tutorial search})
  856. 856 : * @param {string} config.sort the order of the contexts: `TERM, DOCINDEX, POSITION, LEFT, RIGHT`
  857. 857 : * @param {string} config.dir sort direction, **`ASC`**ending or **`DESC`**ending
  858. 858 : * @param {number} config.perDocLimit the `limit` parameter is for the total number of terms returned, this parameter allows you to specify a limit value per document
  859. 859 : * @param {string} config.stripTags for the `left`, `middle` and `right` values, one of the following: `ALL`, `BLOCKSONLY` (tries to maintain blocks for line formatting), `NONE` (default)
  860. 860 : * @param {number} config.context the size of the context (the number of words on each side of the keyword)
  861. 861 : * @param {number} config.docIndex the zero-based index of the documents to include (use commas to separate multiple values)
  862. 862 : * @param {string} config.docId the document IDs to include (use commas to separate multiple values)
  863. 863 : * @param {string} config.overlapStrategy determines how to handle cases where there's overlap between KWICs, such as "to be or not to be" when the keyword is "be"
  864. 864 : * @returns {Promise<Array>} a Promise for an Array of KWIC Objects
  865. 865 : */
  866. 866 : contexts(config) {
  867. 867 : if ((!config || !config.query) && console) {console.warn('No query provided for contexts request.');}
  868. 868 : return Load.trombone(config, {
  869. 869 : tool: 'corpus.DocumentContexts',
  870. 870 : corpus: this.corpusid
  871. 871 : }).then(data => data.documentContexts.contexts);
  872. 872 : }
  873. 873 :
  874. 874 : /*
  875. 875 : * Create a Corpus and return the contexts
  876. 876 : * @param {Object} config
  877. 877 : * @param {Object} api
  878. 878 : */
  879. 879 : // static contexts(config, api) {
  880. 880 : // return Corpus.load(config).then(corpus => corpus.contexts(api || config));
  881. 881 : // }
  882. 882 :
  883. 883 : /**
  884. 884 : * Returns an array of collocates (either document or corpus collocates, depending on the specified mode).
  885. 885 : *
  886. 886 : * The mode is set to "documents" when any of the following is true
  887. 887 : *
  888. 888 : * * the `mode` parameter is set to "documents"
  889. 889 : * * a `docIndex` parameter being set
  890. 890 : * * a `docId` parameter being set
  891. 891 : *
 * The following is an example of a Corpus Collocate (corpus mode):
  893. 893 : *
  894. 894 : * {
  895. 895 : * "term": "love",
  896. 896 : * "rawFreq": 568,
  897. 897 : * "contextTerm": "mr",
  898. 898 : * "contextTermRawFreq": 24
  899. 899 : * }
  900. 900 : *
  901. 901 : * The following is an example of Document Collocate (documents mode):
  902. 902 : *
  903. 903 : * {
  904. 904 : * "docIndex": 4,
  905. 905 : * "keyword": "love",
  906. 906 : * "keywordContextRawFrequency": 124,
  907. 907 : * "term": "fanny",
  908. 908 : * "termContextRawFrequency": 8,
  909. 909 : * "termContextRelativeFrequency": 0.021680217,
  910. 910 : * "termDocumentRawFrequency": 816,
  911. 911 : * "termDocumentRelativeFrequency": 0.0050853477,
  912. 912 : * "termContextDocumentRelativeFrequencyDifference": 0.01659487
  913. 913 : * }
  914. 914 : *
  915. 915 : * The following config parameters are valid in both modes:
  916. 916 : *
  917. 917 : * * **start**: the zero-based start index of the list (for paging)
  918. 918 : * * **limit**: the maximum number of terms to provide per request
  919. 919 : * * **query**: a term query (see [search tutorial]{@tutorial search})
  920. 920 : * * **stopList**: a list of stopwords to include (see [stopwords tutorial]{@tutorial stopwords})
  921. 921 : * * **collocatesWhitelist**: collocates will be limited to this list
  922. 922 : * * **context**: the size of the context (the number of words on each side of the keyword)
  923. 923 : * * **dir**: sort direction, **`ASC`**ending or **`DESC`**ending
  924. 924 : *
  925. 925 : * The following are specific to corpus mode:
  926. 926 : *
  927. 927 : * * **sort**: the order of the terms, one of the following: `RAWFREQ, TERM, CONTEXTTERM, CONTEXTTERMRAWFREQ`
  928. 928 : *
  929. 929 : * The following are specific to documents mode:
  930. 930 : *
  931. 931 : * * **sort**: the order of the terms, one of the following: `TERM, REL, REL, RAW, DOCREL, DOCRAW, CONTEXTDOCRELDIFF`
  932. 932 : * * **docIndex**: the zero-based index of the documents to include (use commas to separate multiple values)
  933. 933 : * * **docId**: the document IDs to include (use commas to separate multiple values)
  934. 934 : *
  935. 935 : * An example:
  936. 936 : *
  937. 937 : * // show top 5 collocate terms
  938. 938 : * loadCorpus("austen").collocates({stopList: 'auto', limit: 5}).then(terms => terms.map(term => term.term))
  939. 939 : *
  940. 940 : * @param {Object} config an Object specifying parameters (see list above)
  941. 941 : * @param {number} config.start the zero-based start index of the list (for paging)
  942. 942 : * @param {number} config.limit the maximum number of terms to provide per request
  943. 943 : * @param {string} config.query a term query (see [search tutorial]{@tutorial search})
  944. 944 : * @param {string} config.stopList a list of stopwords to include (see [stopwords tutorial]{@tutorial stopwords})
  945. 945 : * @param {string} config.collocatesWhitelist collocates will be limited to this list
  946. 946 : * @param {number} config.context the size of the context (the number of words on each side of the keyword)
  947. 947 : * @param {string} config.dir sort direction, **`ASC`**ending or **`DESC`**ending
 * @returns {Promise<Array>} a Promise for an Array of collocate Objects
  949. 949 : */
  950. 950 : collocates(config) {
  951. 951 : if ((!config || !config.query) && console) {console.warn('No query provided for collocates request.');}
  952. 952 : return Load.trombone(config, {
  953. 953 : tool: 'corpus.CorpusCollocates',
  954. 954 : corpus: this.corpusid
  955. 955 : }).then(data => data.corpusCollocates.collocates);
  956. 956 : }
  957. 957 :
  958. 958 : /*
  959. 959 : * Create a Corpus and return the collocates
  960. 960 : * @param {Object} config
  961. 961 : * @param {Object} api
  962. 962 : */
  963. 963 : // static collocates(config, api) {
  964. 964 : // return Corpus.load(config).then(corpus => corpus.collocates(api || config));
  965. 965 : // }
  966. 966 :
  967. 967 : /**
  968. 968 : * Returns an array of phrases or n-grams (either document or corpus phrases, depending on the specified mode).
  969. 969 : *
  970. 970 : * The mode is set to "documents" when any of the following is true
  971. 971 : *
  972. 972 : * * the `mode` parameter is set to "documents"
  973. 973 : * * a `docIndex` parameter being set
  974. 974 : * * a `docId` parameter being set
  975. 975 : *
 * The following is an example of a Corpus phrase (corpus mode), without distributions requested:
  977. 977 : *
  978. 978 : * {
  979. 979 : * "term": "love with",
  980. 980 : * "rawFreq": 103,
  981. 981 : * "length": 2
  982. 982 : * }
  983. 983 : *
  984. 984 : * The following is an example of Document phrase (documents mode), without positions requested:
  985. 985 : *
  986. 986 : * {
  987. 987 : * "term": "love with",
  988. 988 : * "rawFreq": 31,
  989. 989 : * "length": 2,
  990. 990 : * "docIndex": 5
  991. 991 : * }
  992. 992 : *
  993. 993 : * The following config parameters are valid in both modes:
  994. 994 : *
  995. 995 : * * **start**: the zero-based start index of the list (for paging)
  996. 996 : * * **limit**: the maximum number of terms to provide per request
  997. 997 : * * **minLength**: the minimum length of the phrase
  998. 998 : * * **maxLength**: the maximum length of the phrase
  999. 999 : * * **minRawFreq**: the minimum raw frequency of the phrase
  1000. 1000 : * * **sort**: the order of the terms, one of the following: `RAWFREQ, TERM, LENGTH`
  1001. 1001 : * * **dir**: sort direction, **`ASC`**ending or **`DESC`**ending
  1002. 1002 : * * **overlapFilter**: it happens that phrases contain other phrases and we need a strategy for handling overlap:
  1003. 1003 : * * **NONE**: nevermind the overlap, keep all phrases
  1004. 1004 : * * **LENGTHFIRST**: priority goes to the longest phrases
  1005. 1005 : * * **RAWFREQFIRST**: priority goes to the highest frequency phrases
  1006. 1006 : * * **POSITIONFIRST**: priority goes to the first phrases
  1007. 1007 : *
  1008. 1008 : * The following are specific to documents mode:
  1009. 1009 : *
  1010. 1010 : * * **docIndex**: the zero-based index of the documents to include (use commas to separate multiple values)
  1011. 1011 : * * **docId**: the document IDs to include (use commas to separate multiple values)
  1012. 1012 : *
  1013. 1013 : * An example:
  1014. 1014 : *
 * // load the first 10 phrases in the corpus
  1016. 1016 : * loadCorpus("austen").phrases({query: "love", limit: 10})
  1017. 1017 : *
  1018. 1018 : * @param {Object} config an Object specifying parameters (see above)
  1019. 1019 : * @param {number} config.start the zero-based start index of the list (for paging)
  1020. 1020 : * @param {number} config.limit the maximum number of terms to provide per request
  1021. 1021 : * @param {number} config.minLength the minimum length of the phrase
  1022. 1022 : * @param {number} config.maxLength the maximum length of the phrase
  1023. 1023 : * @param {number} config.minRawFreq the minimum raw frequency of the phrase
  1024. 1024 : * @param {string} config.sort the order of the terms, one of the following: `RAWFREQ, TERM, LENGTH`
  1025. 1025 : * @param {string} config.dir sort direction, **`ASC`**ending or **`DESC`**ending
  1026. 1026 : * @param {string} config.overlapFilter it happens that phrases contain other phrases and we need a strategy for handling overlap
  1027. 1027 : * @returns {Promise<Array>} a Promise for an Array of phrase Objects
  1028. 1028 : */
  1029. 1029 : phrases(config) {
  1030. 1030 : return Load.trombone(config, {
  1031. 1031 : tool: isDocumentsMode(config) ? 'corpus.DocumentNgrams' : 'corpus.CorpusNgrams',
  1032. 1032 : corpus: this.corpusid
  1033. 1033 : }).then(data => isDocumentsMode(config) ? data.documentNgrams.ngrams : data.corpusNgrams.ngrams);
  1034. 1034 : }
  1035. 1035 :
  1036. 1036 : /*
  1037. 1037 : * Create a Corpus and return the phrases
  1038. 1038 : * @param {Object} config
  1039. 1039 : * @param {Object} api
  1040. 1040 : */
  1041. 1041 : // static phrases(config, api) {
  1042. 1042 : // return Corpus.load(config).then(corpus => corpus.phrases(api || config));
  1043. 1043 : // }
  1044. 1044 :
  1045. 1045 : /**
  1046. 1046 : * Returns an array of correlations (either document or corpus correlations, depending on the specified mode).
  1047. 1047 : *
  1048. 1048 : * The mode is set to "documents" when any of the following is true
  1049. 1049 : *
  1050. 1050 : * * the `mode` parameter is set to "documents"
  1051. 1051 : * * a `docIndex` parameter being set
  1052. 1052 : * * a `docId` parameter being set
  1053. 1053 : *
 * The following is an example of a Corpus correlation (corpus mode):
  1055. 1055 : *
  1056. 1056 : * {
  1057. 1057 : * "source": {
  1058. 1058 : * "term": "mrs",
  1059. 1059 : * "inDocumentsCount": 8,
  1060. 1060 : * "rawFreq": 2531,
  1061. 1061 : * "relativePeakedness": 0.46444246,
  1062. 1062 : * "relativeSkewness": -0.44197384
  1063. 1063 : * },
  1064. 1064 : * "target": {
  1065. 1065 : * "term": "love",
  1066. 1066 : * "inDocumentsCount": 8,
  1067. 1067 : * "rawFreq": 568,
  1068. 1068 : * "relativePeakedness": 5.763066,
  1069. 1069 : * "relativeSkewness": 2.2536576
  1070. 1070 : * },
  1071. 1071 : * "correlation": -0.44287738,
  1072. 1072 : * "significance": 0.08580014
  1073. 1073 : * }
  1074. 1074 : *
  1075. 1075 : * The following is an example of Document correlation (documents mode), without positions requested:
  1076. 1076 : *
  1077. 1077 : * {
  1078. 1078 : * "source": {
  1079. 1079 : * "term": "confide",
  1080. 1080 : * "rawFreq": 3,
  1081. 1081 : * "relativeFreq": 89.3948,
  1082. 1082 : * "zscore": -0.10560975,
  1083. 1083 : * "zscoreRatio": -0.7541012,
  1084. 1084 : * "tfidf": 1.1168874E-5,
  1085. 1085 : * "totalTermsCount": 33559,
  1086. 1086 : * "docIndex": 0,
  1087. 1087 : * "docId": "8a61d5d851a69c03c6ba9cc446713574"
  1088. 1088 : * },
  1089. 1089 : * "target": {
  1090. 1090 : * "term": "love",
  1091. 1091 : * "rawFreq": 54,
  1092. 1092 : * "relativeFreq": 1609.1063,
  1093. 1093 : * "zscore": 53.830048,
  1094. 1094 : * "zscoreRatio": -707.44696,
  1095. 1095 : * "tfidf": 0.0,
  1096. 1096 : * "totalTermsCount": 33559,
  1097. 1097 : * "docIndex": 0,
  1098. 1098 : * "docId": "8a61d5d851a69c03c6ba9cc446713574"
  1099. 1099 : * },
  1100. 1100 : * "correlation": 0.93527687,
  1101. 1101 : * "significance": 7.0970666E-5
  1102. 1102 : * }
  1103. 1103 : *
  1104. 1104 : * The following config parameters are valid in both modes:
  1105. 1105 : *
  1106. 1106 : * * **start**: the zero-based start index of the list (for paging)
  1107. 1107 : * * **limit**: the maximum number of terms to provide per request
  1108. 1108 : * * **termsOnly**: a very compact data view of the correlations
  1109. 1109 : * * **sort**: the order of the terms, one of the following: `CORRELATION`, `CORRELATIONABS`
  1110. 1110 : * * **dir**: sort direction, **`ASC`**ending or **`DESC`**ending
  1111. 1111 : *
  1112. 1112 : * The following is specific to corpus mode:
  1113. 1113 : *
  1114. 1114 : * * **minInDocumentsCountRatio**: the minimum coverage (as a percentage between 0 and 100) of the term, amongst all the documents
  1115. 1115 : *
  1116. 1116 : * The following are specific to documents mode:
  1117. 1117 : *
  1118. 1118 : * * **docIndex**: the zero-based index of the documents to include (use commas to separate multiple values)
  1119. 1119 : * * **docId**: the document IDs to include (use commas to separate multiple values)
  1120. 1120 : *
  1121. 1121 : * An example:
  1122. 1122 : *
 * // load the first 10 correlations for the query "love"
  1124. 1124 : * loadCorpus("austen").correlations({query: "love", limit: 10})
  1125. 1125 : *
  1126. 1126 : * @param {Object} config an Object specifying parameters (see above)
  1127. 1127 : * @param {number} config.start the zero-based start index of the list (for paging)
  1128. 1128 : * @param {number} config.limit the maximum number of terms to provide per request
  1129. 1129 : * @param {number} config.minInDocumentsCountRatio the minimum coverage (as a percentage between 0 and 100) of the term, amongst all the documents
  1130. 1130 : * @param {boolean} config.termsOnly a very compact data view of the correlations
  1131. 1131 : * @param {string} config.sort the order of the terms, one of the following: `CORRELATION`, `CORRELATIONABS`
  1132. 1132 : * @param {string} config.dir sort direction, **`ASC`**ending or **`DESC`**ending
 * @returns {Promise<Array>} a Promise for an Array of correlation Objects
  1134. 1134 : */
  1135. 1135 : correlations(config) {
  1136. 1136 : if ((!config || !config.query) && console) {
  1137. 1137 : console.warn('No query provided for correlations request.');
  1138. 1138 : if (!isDocumentsMode(config)) {
  1139. 1139 : throw new Error('Unable to run correlations for a corpus without a query.');
  1140. 1140 : }
  1141. 1141 : }
  1142. 1142 : return Load.trombone(config, {
  1143. 1143 : tool: isDocumentsMode(config) ? 'corpus.DocumentTermCorrelations' : 'corpus.CorpusTermCorrelations',
  1144. 1144 : corpus: this.corpusid
  1145. 1145 : }).then(data => data.termCorrelations.correlations);
  1146. 1146 : }
  1147. 1147 :
  1148. 1148 : /*
  1149. 1149 : * Create a Corpus and return the correlations
  1150. 1150 : * @param {Object} config
  1151. 1151 : * @param {Object} api
  1152. 1152 : */
  1153. 1153 : // static correlations(config, api) {
  1154. 1154 : // return Corpus.load(config).then(corpus => corpus.correlations(api || config));
  1155. 1155 : // }
  1156. 1156 :
  1157. 1157 : /**
  1158. 1158 : * Get lemmas. This is the equivalent of calling: this.tokens({ withPosLemmas: true, noOthers: true })
  1159. 1159 : * @param {Object} config an Object specifying parameters (see above)
  1160. 1160 : * @returns {Promise<Array>} a Promise for an Array of lemma Objects
  1161. 1161 : */
  1162. 1162 : lemmas(config={}) {
  1163. 1163 : config.withPosLemmas = true;
  1164. 1164 : config.noOthers = true;
  1165. 1165 : return this.tokens(config);
  1166. 1166 : }
  1167. 1167 :
  1168. 1168 : /**
  1169. 1169 : * Performs topic modelling using the latent Dirichlet allocation. Returns an object that has two primary properties:
  1170. 1170 : *
  1171. 1171 : * * **topics**: an array of topics (words organized into bunches of a specified size)
  1172. 1172 : * * **topicDocuments**: an array of documents and their topic weights
  1173. 1173 : *
  1174. 1174 : * Each topic in the **topics** array is an object with the following properties:
  1175. 1175 : *
  1176. 1176 : * * **words**: an array of the actual words that form the topic. Each word has the same properties as the topic, as well as a "word" property that contains the text content.
  1177. 1177 : * * tokens
  1178. 1178 : * * documentEntropy
  1179. 1179 : * * wordLength
  1180. 1180 : * * coherence
  1181. 1181 : * * uniformDist
  1182. 1182 : * * corpusDist
  1183. 1183 : * * effNumWords
  1184. 1184 : * * tokenDocDiff
  1185. 1185 : * * rank1Docs
  1186. 1186 : * * allocationRatio
  1187. 1187 : * * allocationCount
  1188. 1188 : * * exclusivity
  1189. 1189 : *
  1190. 1190 : * Each document in the **topicDocuments** array is an object with the following properties:
  1191. 1191 : *
  1192. 1192 : * * docId: the document ID
   * * weights: an array of the numbers corresponding to the weight of each topic in this document
  1194. 1194 : *
  1195. 1195 : * The config object as parameter can contain the following:
  1196. 1196 : *
  1197. 1197 : * * **topics**: the number of topics to get (default is 10)
  1198. 1198 : * * **termsPerTopic**: the number of terms for each topic (default is 10)
  1199. 1199 : * * **iterations**: the number of iterations to do, more iterations = more accurate (default is 100)
  1200. 1200 : * * **perDocLimit**: the token limit per document, starting at the beginning of the document
  1201. 1201 : * * **seed**: specify a particular seed to use for random number generation
  1202. 1202 : * * **stopList**: a list of stopwords to include
  1203. 1203 : *
  1204. 1204 : * @param {Object} config (see above)
  1205. 1205 : * @param {number} config.topics the number of topics to get (default is 10)
  1206. 1206 : * @param {number} config.termsPerTopic the number of terms for each topic (default is 10)
  1207. 1207 : * @param {number} config.iterations the number of iterations to do, more iterations = more accurate (default is 100)
  1208. 1208 : * @param {number} config.perDocLimit specify a token limit per document, starting at the beginning of the document
  1209. 1209 : * @param {number} config.seed specify a particular seed to use for random number generation
  1210. 1210 : * @param {string} config.stopList a list of stopwords to include (see [stopwords tutorial]{@tutorial stopwords})
  1211. 1211 : * @returns {Promise<Object>}
  1212. 1212 : */
  1213. 1213 : async topics(config = {topics: 10, termsPerTopic: 10, iterations: 100, seed: 0, stopList: 'auto'}) {
  1214. 1214 : return Load.trombone(config, {
  1215. 1215 : tool: 'analysis.TopicModeling',
  1216. 1216 : corpus: this.corpusid
  1217. 1217 : }).then(data => data.topicModeling);
  1218. 1218 : }
  1219. 1219 :
  1220. 1220 : /**
  1221. 1221 : * Returns an array of entities.
  1222. 1222 : *
  1223. 1223 : * The config object as parameter can contain the following:
  1224. 1224 : *
  1225. 1225 : * * **docIndex**: document index to restrict to (can be comma-separated list)
  1226. 1226 : * * **annotator**: the annotator to use: 'stanford' or 'nssi' or 'spacy'
  1227. 1227 : *
  1228. 1228 : * @param {Object} config
  1229. 1229 : * @param {(number|string)} config.docIndex document index to restrict to (can be comma-separated list)
  1230. 1230 : * @param {string} config.annotator the annotator to use: 'stanford' or 'nssi' or 'spacy'
  1231. 1231 : * @returns {Promise<Array>}
  1232. 1232 : */
  1233. 1233 : entities(config = {annotator: 'stanford'}) {
  1234. 1234 : const timeoutDelay = 5000;
  1235. 1235 : const corpusId = this.corpusid;
  1236. 1236 : return new Promise((resolve, reject) => {
  1237. 1237 : function doLoad(config) {
  1238. 1238 : Load.trombone(config, {
  1239. 1239 : tool: 'corpus.DocumentEntities',
  1240. 1240 : includeEntities: true,
  1241. 1241 : noCache: true, // never cache, we don't want stale entity status
  1242. 1242 : corpus: corpusId
  1243. 1243 : }).then(data => {
  1244. 1244 : const total = data.documentEntities.status.length;
  1245. 1245 : let numDone = 0;
  1246. 1246 : let hasFailures = false;
  1247. 1247 : data.documentEntities.status.forEach(function(item) {
  1248. 1248 : if (item[1] === 'done') numDone++;
  1249. 1249 : else if (item[1].indexOf('failed') === 0) {
  1250. 1250 : numDone++;
  1251. 1251 : hasFailures = true;
  1252. 1252 : }
  1253. 1253 : });
  1254. 1254 : const isDone = numDone === total;
  1255. 1255 :
  1256. 1256 : if (isDone) {
  1257. 1257 : if (hasFailures && numDone === 1) {
  1258. 1258 : reject('Failed to get entities');
  1259. 1259 : } else {
  1260. 1260 : resolve(data.documentEntities.entities);
  1261. 1261 : }
  1262. 1262 : } else {
  1263. 1263 : delete config.retryFailures;
  1264. 1264 : setTimeout(doLoad.bind(this, config), timeoutDelay);
  1265. 1265 : }
  1266. 1266 :
  1267. 1267 : }, (error) => reject(error));
  1268. 1268 : }
  1269. 1269 :
  1270. 1270 : doLoad(config);
  1271. 1271 : });
  1272. 1272 : }
  1273. 1273 :
  1274. 1274 : /**
  1275. 1275 : * Given a Categories instance or ID, returns an object mapping category names to corpus terms. The results can be limited to specific category names by providing one or more of them.
  1276. 1276 : * @param {String|Spyral.Categories} categories A categories ID or a Spyral.Categories instance.
  1277. 1277 : * @param {String|Array<String>} [categoryName] One or more names of categories within the instance.
  1278. 1278 : * @returns {Promise<Object>}
  1279. 1279 : */
  1280. 1280 : async filterByCategory(categories, categoryName) {
  1281. 1281 : if (categories === undefined) return;
  1282. 1282 :
  1283. 1283 : if (categories instanceof Categories === false) {
  1284. 1284 : categories = await Categories.load(categories);
  1285. 1285 : }
  1286. 1286 :
  1287. 1287 : let categoryNames = [];
  1288. 1288 :
  1289. 1289 : // TODO make sure categoryName is a valid key for categories
  1290. 1290 : if (categoryName === undefined) {
  1291. 1291 : categoryNames = categories.getCategoryNames();
  1292. 1292 : } else if (Util.isString(categoryName)) {
  1293. 1293 : categoryNames = [categoryName];
  1294. 1294 : } else {
  1295. 1295 : categoryNames = categoryName;
  1296. 1296 : }
  1297. 1297 :
  1298. 1298 : const termsResults = await Promise.all(
  1299. 1299 : categoryNames.map(key => {
  1300. 1300 : let catTerms = categories.getCategoryTerms(key);
  1301. 1301 : return this.terms({whiteList: catTerms});
  1302. 1302 : })
  1303. 1303 : );
  1304. 1304 :
  1305. 1305 : let results = {};
  1306. 1306 : termsResults.forEach((terms, i) => {
  1307. 1307 : results[categoryNames[i]] = terms;
  1308. 1308 : });
  1309. 1309 :
  1310. 1310 : return results;
  1311. 1311 : }
  1312. 1312 :
  1313. 1313 : /**
  1314. 1314 : * Performs one of several dimension reduction statistical analysis techniques.
  1315. 1315 : *
  1316. 1316 : * For more details see the [scatterplot tutorial]{@tutorial scatterplot}.
  1317. 1317 : *
  1318. 1318 : * @param {Object} config
  1319. 1319 : * @param {string} config.type The type of analysis technique to use: 'ca', 'pca', 'tsne', 'docsim'
  1320. 1320 : * @param {number} config.start The zero-based start of the list
  1321. 1321 : * @param {number} config.limit A limit to the number of items to return at a time
  1322. 1322 : * @param {number} config.dimensions The number of dimensions to render, either 2 or 3.
  1323. 1323 : * @param {number} config.bins The number of bins to separate a document into.
  1324. 1324 : * @param {number} config.clusters The number of clusters within which to group words.
  1325. 1325 : * @param {number} config.perplexity The TSNE perplexity value.
  1326. 1326 : * @param {number} config.iterations The TSNE iterations value.
  1327. 1327 : * @param {string} config.comparisonType The value to use for comparing terms. Options are: 'raw', 'relative', and 'tfidf'.
  1328. 1328 : * @param {string} config.target The term to set as the target. This will filter results to terms that are near the target.
  1329. 1329 : * @param {string} config.term Used in combination with "target" as a white list of terms to keep.
  1330. 1330 : * @param {string} config.query A term query (see [search tutorial]{@tutorial search})
  1331. 1331 : * @param {string} config.stopList A list of stopwords to include (see [stopwords tutorial]{@tutorial stopwords})
  1332. 1332 : * @returns {Promise<Object>}
  1333. 1333 : */
  1334. 1334 : analysis(config = {}) {
  1335. 1335 : config = Object.assign({
  1336. 1336 : type: 'ca', start: 0, limit: 50, dimensions: 3, bins: 10, clusters: 3, perplexity: 15, iterations: 1500, comparisonType: 'relative', target: undefined, term: undefined, query: undefined, stopList: 'auto'
  1337. 1337 : }, config);
  1338. 1338 : const analysis = config.type.toLowerCase();
  1339. 1339 : delete config.type;
  1340. 1340 : let tool = '';
  1341. 1341 : let root = '';
  1342. 1342 : if (analysis === 'tsne') {
  1343. 1343 : tool = 'corpus.TSNE';
  1344. 1344 : root = 'tsneAnalysis';
  1345. 1345 : } else if (analysis === 'pca') {
  1346. 1346 : tool = 'corpus.PCA';
  1347. 1347 : root = 'pcaAnalysis';
  1348. 1348 : } else if (analysis === 'docsim') {
  1349. 1349 : tool = 'corpus.DocumentSimilarity';
  1350. 1350 : root = 'documentSimilarity';
  1351. 1351 : } else {
  1352. 1352 : tool = 'corpus.CA';
  1353. 1353 : root = 'correspondenceAnalysis';
  1354. 1354 : }
  1355. 1355 : return Load.trombone(config, {
  1356. 1356 : tool,
  1357. 1357 : withDistributions: true,
  1358. 1358 : corpus: this.corpusid
  1359. 1359 : }).then(data => data[root]);
  1360. 1360 : }
  1361. 1361 :
  1362. 1362 : /**
   * Returns an HTML snippet that will cause the specified Voyant tools to appear.
  1364. 1364 : *
  1365. 1365 : * In its simplest form we can simply call the named tool:
  1366. 1366 : *
  1367. 1367 : * loadCorpus("austen").tool("Cirrus");
  1368. 1368 : *
  1369. 1369 : * Each tool supports some options (that are summarized below), and those can be specified as options:
  1370. 1370 : *
  1371. 1371 : * loadCorpus("austen").tool("Trends", {query: "love"});
  1372. 1372 : *
  1373. 1373 : * There are also parameters (width, height, style, float) that apply to the actual tool window:
  1374. 1374 : *
  1375. 1375 : * loadCorpus("austen").tool("Trends", {query: "love", style: "width: 500px; height: 500px"});
  1376. 1376 : *
  1377. 1377 : * It's also possible to have several tools appear at once, though they won't be connected by events (clicking in a window won't modify the other windows):
  1378. 1378 : *
  1379. 1379 : * loadCorpus("austen").tool("Cirrus", "Trends");
  1380. 1380 : *
  1381. 1381 : * One easy way to get connected tools is to use the `CustomSet` tool and experiment with the layout:
  1382. 1382 : *
  1383. 1383 : * loadCorpus("austen").tool("CustomSet", {tableLayout: "Cirrus,Trends", style: "width:800px; height: 500px"});
  1384. 1384 : *
  1385. 1385 : * See [the list of corpus tools]{@link Tools} for available tools and options.
  1386. 1386 : *
  1387. 1387 : * @param {string} tool The tool to display
  1388. 1388 : * @param {Object} config The config object for the tool
  1389. 1389 : * @returns {Promise<string>}
  1390. 1390 : */
  1391. 1391 : tool(_tool, config = {}) {
  1392. 1392 : let me = this;
  1393. 1393 : return new Promise((resolve, reject) => {
  1394. 1394 : let isTool = function(obj) {return obj && (typeof obj==='string' && /\W/.test(obj)===false) || (typeof obj === 'object' && 'forTool' in obj);};
  1395. 1395 : let isConfig = function(obj) {return obj && typeof obj === 'object' && !('forTool' in obj);};
  1396. 1396 : let lastArg = arguments[arguments.length-1];
  1397. 1397 : config = isConfig(lastArg) ? lastArg : {};
  1398. 1398 :
  1399. 1399 : // we have all tools and we'll show them individually
  1400. 1400 : if (isTool(_tool) && (isTool(lastArg) || isConfig(lastArg))) {
  1401. 1401 : let defaultAttributes = {
  1402. 1402 : style: ''
  1403. 1403 : };
  1404. 1404 : let out = '';
  1405. 1405 : for (let i=0; i<arguments.length; i++) {
  1406. 1406 : let t = arguments[i];
  1407. 1407 : if (isTool(t)) {
  1408. 1408 : if (typeof t === 'string') {t = {forTool: t};} // make sure we have object
  1409. 1409 :
  1410. 1410 : // process width and height info
  1411. 1411 : var width = config['width'] !== undefined ? config['width']+'' : '350';
  1412. 1412 : var height = config['height'] !== undefined ? config['height']+'' : '350';
  1413. 1413 : if (width.search(/^\d+$/) === 0) width += 'px';
  1414. 1414 : if (height.search(/^\d+$/) === 0) height += 'px';
  1415. 1415 : if (config['style'] !== undefined) {
  1416. 1416 : if (config['style'].indexOf('width') === -1) {
  1417. 1417 : config['style'] = `width: ${width};` + config['style'];
  1418. 1418 : }
  1419. 1419 : if (config['style'].indexOf('height') === -1) {
  1420. 1420 : config['style'] = `height: ${height};` + config['style'];
  1421. 1421 : }
  1422. 1422 : } else {
  1423. 1423 : config['style'] = `width: ${width}; height: ${height};`;
  1424. 1424 : }
  1425. 1425 :
  1426. 1426 : // build iframe tag
  1427. 1427 : out+='<iframe ';
  1428. 1428 : for (let attr in defaultAttributes) {
  1429. 1429 : var val = (attr in t ? t[attr] : undefined) || (attr in config ? config[attr] : undefined) || (attr in defaultAttributes ? defaultAttributes[attr] : undefined);
  1430. 1430 : if (val!==undefined) {
  1431. 1431 : out+=' '+attr+'="'+val+'"';
  1432. 1432 : }
  1433. 1433 : }
  1434. 1434 :
  1435. 1435 : // build url
  1436. 1436 : var url = new URL((config && config.voyantUrl ? config.voyantUrl : Load.baseUrl) + 'tool/'+t.forTool+'/');
  1437. 1437 : url.searchParams.append('corpus', me.corpusid);
  1438. 1438 : // add API values from config (some may be ignored)
  1439. 1439 : let all = Object.assign(t,config);
  1440. 1440 : Object.keys(all).forEach(key => {
  1441. 1441 : if (key !=='input' && !(key in defaultAttributes)) {
  1442. 1442 : let value = all[key];
  1443. 1443 : // TODO need to sort this out, if key is "query" and value is an array then stringify will break the query format for voyant
  1444. 1444 : // if (typeof value !== 'string') {
  1445. 1445 : // value = JSON.stringify(value);
  1446. 1446 : // }
  1447. 1447 : url.searchParams.append(key, value);
  1448. 1448 : }
  1449. 1449 : });
  1450. 1450 :
  1451. 1451 : // finish tag
  1452. 1452 : out+=' src="'+url+'"></iframe>';
  1453. 1453 : }
  1454. 1454 : }
  1455. 1455 : return resolve(out);
  1456. 1456 : } else {
  1457. 1457 : if (Array.isArray(_tool)) {
  1458. 1458 : _tool = _tool.join(';');
  1459. 1459 : }
  1460. 1460 :
  1461. 1461 : let defaultAttributes = {
  1462. 1462 : width: undefined,
  1463. 1463 : height: undefined,
  1464. 1464 : style: 'width: 90%; height: '+(350*(_tool ? _tool : '').split(';').length)+'px'
  1465. 1465 : };
  1466. 1466 :
  1467. 1467 : // build iframe tag
  1468. 1468 : let out ='<iframe ';
  1469. 1469 : for (let attr in defaultAttributes) {
  1470. 1470 : var val = (attr in config ? config[attr] : undefined) || (attr in defaultAttributes ? defaultAttributes[attr] : undefined);
  1471. 1471 : if (val!==undefined) {
  1472. 1472 : out+=' '+attr+'="'+val+'"';
  1473. 1473 : }
  1474. 1474 : }
  1475. 1475 :
  1476. 1476 : // build url
  1477. 1477 : var url = new URL((config && config.voyantUrl ? config.voyantUrl : Load.baseUrl)+(_tool ? ('?view=customset&tableLayout='+_tool) : ''));
  1478. 1478 : url.searchParams.append('corpus', me.corpusid);
  1479. 1479 : // add API values from config (some may be ignored)
  1480. 1480 : Object.keys(config).forEach(key => {
  1481. 1481 : if (key !=='input' && !(key in defaultAttributes)) {
  1482. 1482 : let value = config[key];
  1483. 1483 : // if (typeof value !== 'string') {
  1484. 1484 : // value = JSON.stringify(value);
  1485. 1485 : // }
  1486. 1486 : url.searchParams.append(key, value);
  1487. 1487 : }
  1488. 1488 : });
  1489. 1489 : resolve(out+' src=\''+url+'\'></iframe>');
  1490. 1490 : }
  1491. 1491 : });
  1492. 1492 : }
  1493. 1493 :
  1494. 1494 : /*
  1495. 1495 : * Create a Corpus and return the tool
  1496. 1496 : * @param {*} tool
  1497. 1497 : * @param {*} config
  1498. 1498 : * @param {*} api
  1499. 1499 : */
  1500. 1500 : // static tool(tool, config, api) {
  1501. 1501 : // return Corpus.load(config).then(corpus => corpus.tool(tool, config, api));
  1502. 1502 : // }
  1503. 1503 :
  1504. 1504 : /**
  1505. 1505 : * An alias for [summary]{@link Spyral.Corpus#summary}.
  1506. 1506 : */
  toString() {
    // Delegates straight to summary() and returns its result unchanged.
    // NOTE(review): if summary() is asynchronous, this yields a Promise rather than a string — confirm.
    return this.summary();
  }
  1510. 1510 :
  1511. 1511 : /*
  1512. 1512 : * Create a new Corpus using the provided config
  1513. 1513 : * @param {Object} config
  1514. 1514 : */
  1515. 1515 : // static create(config) {
  1516. 1516 : // return Corpus.load(config);
  1517. 1517 : // }
  1518. 1518 :
  1519. 1519 : /**
  1520. 1520 : * Load a Corpus using the provided config and api
  1521. 1521 : * @param {Spyral.Corpus~CorpusConfig} config the Corpus config
  1522. 1522 : * @param {Object} api any additional API values
  1523. 1523 : * @returns {Promise<Corpus>}
  1524. 1524 : * @static
  1525. 1525 : */
  1526. 1526 : static load(config={}, api = {}) {
  1527. 1527 : const promise = new Promise(function(resolve, reject) {
  1528. 1528 :
  1529. 1529 : if (config instanceof Corpus) {
  1530. 1530 : resolve(config);
  1531. 1531 : }
  1532. 1532 :
  1533. 1533 : if (Util.isString(config)) {
  1534. 1534 : if (config.length>0 && /\W/.test(config)===false) {
  1535. 1535 : config = {corpus: config};
  1536. 1536 : } else {
  1537. 1537 : config = {input: config};
  1538. 1538 : }
  1539. 1539 : } else if (Util.isArray(config) && config.length > 0 && typeof config[0] === 'string') {
  1540. 1540 : config = {input: config};
  1541. 1541 : } else if (Util.isBlob(config) || Util.isNode(config) || (Util.isArray(config) && (Util.isBlob(config[0]) || Util.isNode(config[0])))) {
  1542. 1542 : const formData = new FormData();
  1543. 1543 : if (Util.isArray(config)) {
  1544. 1544 : config.forEach(file => {
  1545. 1545 : if (Util.isNode(file)) {
  1546. 1546 : const nodeString = new XMLSerializer().serializeToString(file);
  1547. 1547 : file = new Blob([nodeString], {type: 'text/xml'});
  1548. 1548 : }
  1549. 1549 : formData.append('input', file);
  1550. 1550 : const fileExt = Util.getFileExtensionFromMimeType(file.type);
  1551. 1551 : formData.append('inputFormat', Util.getVoyantDocumentFormatFromFileExtension(fileExt));
  1552. 1552 : });
  1553. 1553 : } else {
  1554. 1554 : if (Util.isNode(config)) {
  1555. 1555 : const nodeString = new XMLSerializer().serializeToString(config);
  1556. 1556 : config = new Blob([nodeString], {type: 'text/xml'});
  1557. 1557 : }
  1558. 1558 : formData.set('input', config);
  1559. 1559 : const fileExt = Util.getFileExtensionFromMimeType(config.type);
  1560. 1560 : formData.set('inputFormat', Util.getVoyantDocumentFormatFromFileExtension(fileExt));
  1561. 1561 : }
  1562. 1562 :
  1563. 1563 : // append any other form options that may have been included
  1564. 1564 : if (api && Util.isObject(api)) {
  1565. 1565 : for (let key in api) {
  1566. 1566 : formData.set(key, api[key]);
  1567. 1567 : }
  1568. 1568 : }
  1569. 1569 :
  1570. 1570 : formData.set('tool', 'corpus.CorpusMetadata');
  1571. 1571 :
  1572. 1572 : config = {
  1573. 1573 : body: formData,
  1574. 1574 : method: 'POST'
  1575. 1575 : };
  1576. 1576 : } else if (Util.isObject(config)) {
  1577. 1577 : if (config.inputFormat === 'json' && Util.isString(config.input) === false) {
  1578. 1578 : config.input = JSON.stringify(config.input);
  1579. 1579 : }
  1580. 1580 : }
  1581. 1581 :
  1582. 1582 : Load.trombone({...config,...api}, {tool: 'corpus.CorpusMetadata'})
  1583. 1583 : .then((data) => {
  1584. 1584 : resolve(new Corpus(data.corpus.metadata.id));
  1585. 1585 : }, (err) => {
  1586. 1586 : reject(err);
  1587. 1587 : });
  1588. 1588 :
  1589. 1589 : });
  1590. 1590 :
  1591. 1591 : ['analysis','collocates','contexts','correlations','documents','entities','id','topics','lemmas','metadata','phrases','summary','terms','text','texts','titles','toString','tokens','tool','words'].forEach(name => {
  1592. 1592 : promise[name] = function() {
  1593. 1593 : var args = arguments;
  1594. 1594 : return promise.then(corpus => {return corpus[name].apply(corpus, args);});
  1595. 1595 : };
  1596. 1596 : });
  1597. 1597 :
  1598. 1598 : // TODO document assign
  1599. 1599 : promise.assign = function(name) {
  1600. 1600 : return this.then(corpus => {window[name] = corpus; return corpus;});
  1601. 1601 : };
  1602. 1602 :
  1603. 1603 : return promise;
  1604. 1604 : }
  1605. 1605 : }
  1606. 1606 :
  1607. 1607 : export default Corpus;