001/* 002 * Copyright © 2012, 2013, 2014 Royal Botanic Gardens, Kew. 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 005 * 006 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 007 * 008 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 009 */ 010package org.kew.rmf.transformers.collations.wcs; 011 012import java.io.BufferedReader; 013import java.io.BufferedWriter; 014import java.io.FileInputStream; 015import java.io.FileWriter; 016import java.io.InputStreamReader; 017import java.util.Arrays; 018 019import org.kew.rmf.transformers.RomanNumeralTransformer; 020import org.kew.rmf.transformers.collations.CollationStructureTransformer; 021 022public class CollationUtils { 023 024 public static int SERIES_INDEX = 0; 025 public static int VOL_INDEX = 1; 026 public static int ISSUE_INDEX = 2; 027 public static int PAGE_INDEX = 3; 028 public static int TAB_OR_FIG_INDEX = 4; 029 public static int YEAR_INDEX = 5; 030 public static int RULE_INDEX = 6; 031 032 private static String convertRoman(String r){ 033 return new RomanNumeralTransformer().transform(r); 034 } 035 036 private static String[] splitCollation(String collation){ 037 return CollationStructureTransformer.splitCollation(collation); 038 } 039 040 public static String assessCollationStructure(String collation){ 041 return CollationStructureTransformer.assessCollationStructure(collation); 042 } 043 044 public static boolean parsableCollation(String collation){ 045 return !Arrays.toString(parseCollation(collation)).equals("[, , , , , , ]"); 046 } 047 public static String[] parseCollation(String collation){ 048 String pattern = assessCollationStructure(collation); 049 String series=""; 050 String volume=""; 051 String issue=""; 052 String page=""; 053 String tab_or_fig_or_no=""; 054 String year=""; 055 String rule=""; 056 057 if (pattern.equals("d: d") 058 || pattern.equals("d: r") 059 || pattern.equals("yyyy: d") 060 || pattern.equals("d: yyyy") 061 || pattern.equals("yyyy: yyyy")){ 062 //Example: "12: 89" 063 //Series: "" 064 series = ""; 065 //Volume: "12" 066 volume = splitCollation(collation)[0]; 067 //Issue: "" 068 issue = ""; 069 //Page: "89" 070 page = splitCollation(collation)[1]; 071 //Tab/Fig/No: "" 072 tab_or_fig_or_no = ""; 073 //Year: "" 074 year = ""; 075 rule = "1"; 076 } 077 if (pattern.equals(", d: d") || pattern.equals(", yyyy: d") || pattern.equals(", d: yyyy") || pattern.equals(", yyyy: yyyy")){ 078 //Example: ", 12: 89" 079 //Series: "" 080 series = ""; 081 //Volume: "12" 082 volume = convertRoman(splitCollation(collation)[1]); 083 //Issue: "" 084 issue = ""; 085 //Page: "89" 086 page = convertRoman(splitCollation(collation)[2]); 087 //Tab/Fig/No: "" 088 tab_or_fig_or_no = ""; 089 //Year: "" 090 year = ""; 091 rule = "1.1"; 092 } 093 if (pattern.equals(": d") || pattern.equals(": yyyy") || pattern.equals(": r")){ 094 //Example: ": 121" 095 //Series: "" 096 series = ""; 097 //Volume: "" 098 volume = ""; 099 //Issue: "" 100 issue = ""; 101 //Page: "121" 102 page = collation.split(" ")[1]; 103 //Tab/Fig/No: "" 104 tab_or_fig_or_no = ""; 105 //Year: "" 106 year = ""; 107 rule = "2"; 108 } 109 if (pattern.equals("d(d): d") || pattern.equals("d(d): yyyy") || pattern.equals("yyyy(d): d")){ 110 //Example: "12(2): 61" 111 //Series: "" 112 series = ""; 113 //Volume: "12" 114 volume = splitCollation(collation)[0]; 115 //Issue: "2" 116 issue = splitCollation(collation)[1]; 117 //Page: "61" 118 page = splitCollation(collation)[2]; 119 //Tab/Fig/No: "" 120 tab_or_fig_or_no = ""; 121 //Year: "" 122 year = ""; 123 rule = "3"; 124 } 125 if (pattern.equals("d: d (yyyy)")){ 126 //Example: "12: 99 (1898)" 127 //Series: "" 128 series = ""; 129 //Volume: "12" 130 volume = splitCollation(collation)[0]; 131 //Issue: "" 132 issue = ""; 133 //Page: "99" 134 page = splitCollation(collation)[1]; 135 //Tab/Fig/No: "" 136 tab_or_fig_or_no = ""; 137 //Year: "1898" 138 year = splitCollation(collation)[2]; 139 rule = "3"; 140 } 141 if (pattern.equals(", r, d: d")){ 142 //Example: ", III, 102: 414" 143 //Series: "III" 144 series = splitCollation(collation)[1]; 145 //Volume: "102" 146 volume = splitCollation(collation)[2]; 147 //Issue: "" 148 issue = ""; 149 //Page: "4141" 150 page = splitCollation(collation)[3]; 151 //Tab/Fig/No: "" 152 tab_or_fig_or_no = ""; 153 //Year: "" 154 year = ""; 155 rule = "65.1"; 156 } 157 if (pattern.equals(", a. d: d")){ 158 //Example: ", ed. 3: 273" 159 // if "a" = "ed" OR "Nachtr" OR "Suppl" OR "Beib" OR "Beih" OR "Reimpr" 160 if (splitCollation(collation)[1].equals("ed") 161 || splitCollation(collation)[1].equals("Nachtr") 162 || splitCollation(collation)[1].equals("Suppl") 163 || splitCollation(collation)[1].equals("Beib") 164 || splitCollation(collation)[1].equals("Beih") 165 || splitCollation(collation)[1].equals("Reimpr")) 166 //Series: "ed. 3" 167 series = splitCollation(collation)[1] + ". " + splitCollation(collation)[2]; 168 //Volume: "" 169 volume = ""; 170 //Issue: "" 171 issue = ""; 172 //Page: "2731" 173 page = splitCollation(collation)[3]; 174 //Tab/Fig/No: "" 175 tab_or_fig_or_no = ""; 176 //Year: "" 177 year = ""; 178 rule = "65.2"; 179 } 180 if (pattern.equals(", a.a., d: d")){ 181 //Example: ", a.s., 3: 508" 182 // if "a" = "a.s." OR "n.f." OR "n.s." 183 if ((splitCollation(collation)[1]+"."+splitCollation(collation)[2]+".").equals("a.s.") 184 || (splitCollation(collation)[1]+"."+splitCollation(collation)[2]+".").equals("n.f.") 185 || (splitCollation(collation)[1]+"."+splitCollation(collation)[2]+".").equals("n.s.")) 186 series = splitCollation(collation)[1]+"."+splitCollation(collation)[2]+"."; 187 //Volume: "3" 188 volume = splitCollation(collation)[3]; 189 //Issue: "" 190 issue = ""; 191 //Page: "508" 192 page = splitCollation(collation)[4]; 193 //Tab/Fig/No: "" 194 tab_or_fig_or_no = ""; 195 //Year: "" 196 year = ""; 197 rule = "65.3"; 198 } 199 if (pattern.equals(", a.a., d(d): d")){ 200 //Example: ", a.s., 3(2): 508" 201 // if "a" = "a.s." OR "n.f." OR "n.s." 202 if ((splitCollation(collation)[1]+"."+splitCollation(collation)[2]+".").equals("a.s.") 203 || (splitCollation(collation)[1]+"."+splitCollation(collation)[2]+".").equals("n.f.") 204 || (splitCollation(collation)[1]+"."+splitCollation(collation)[2]+".").equals("n.s.")) 205 series = splitCollation(collation)[1]+"."+splitCollation(collation)[2]+"."; 206 //Volume: "3" 207 volume = splitCollation(collation)[3]; 208 //Issue: "" 209 issue = splitCollation(collation)[4]; 210 //Page: "508" 211 page = splitCollation(collation)[5]; 212 //Tab/Fig/No: "" 213 tab_or_fig_or_no = ""; 214 //Year: "" 215 year = ""; 216 rule = "65.12"; 217 } 218 if (pattern.equals(", a. d, d: d")){ 219 String[] c=splitCollation(collation.replaceAll(" ", "")); 220 if (c[1].equals("ed")){ 221 //Example: ", ed. 9, 1: 197" 222 series = c[1]+". "+c[2]; 223 //Volume: "3" 224 volume = c[3]; 225 //Issue: "" 226 issue = ""; 227 //Page: "508" 228 page = c[4]; 229 //Tab/Fig/No: "" 230 tab_or_fig_or_no = ""; 231 //Year: "" 232 year = ""; 233 rule = "65.4"; 234 } 235 else{ 236 series=c[2]; 237 volume=c[3]; 238 page=c[4]; 239 rule="65.4a"; 240 } 241 } 242 if (pattern.equals(", a. d, d(d): d")){ 243 String[] c=splitCollation(collation.replaceAll(" ", "")); 244 if (c[1].equals("ed")){ 245 //Example: ", ed. 9, 1(1): 197" 246 series = c[1]+". "+c[2]; 247 //Volume: "3" 248 volume = c[3]; 249 //Issue: "" 250 issue = c[4]; 251 //Page: "508" 252 page = c[5]; 253 //Tab/Fig/No: "" 254 tab_or_fig_or_no = ""; 255 //Year: "" 256 year = ""; 257 rule = "65.49"; 258 } 259 else{ 260 series=c[2]; 261 volume=c[3]; 262 issue=c[4]; 263 page=c[5]; 264 rule="65.49a"; 265 } 266 } 267 268 if (pattern.equals(", r, d(d): d")){ 269 // Example: ", II, 1(1): 28" 270 //Series: "" 271 series = splitCollation(collation)[1]; 272 //Volume: "C" 273 volume = splitCollation(collation)[2]; 274 //Issue: "" 275 issue = splitCollation(collation)[3]; 276 //Page: "433" 277 page = splitCollation(collation)[4]; 278 //Tab/Fig/No: "" 279 tab_or_fig_or_no = ""; 280 //Year: "" 281 year = ""; 282 rule = "65.5"; 283 } 284 // 1(Conif.): 42 d(a.): d 1 (Conif.) 42 TRUE 65.6 285 if (pattern.equals("d(a.): d")){ 286 // Example: "1(Conif.): 42" 287 //Series: "" 288 series = ""; 289 //Volume: "1(Conif.)" 290 volume = collation.split(":")[0]; 291 //Issue: "" 292 issue = ""; 293 //Page: "42" 294 page = splitCollation(collation)[2]; 295 //Tab/Fig/No: "" 296 tab_or_fig_or_no = ""; 297 //Year: "" 298 year = ""; 299 rule = "65.6"; 300 } 301 // , Aloac.: 41 , a.: d 41 true 65.8 302 if (pattern.equals(", a.: d")){ 303 // Example: ", Aloac.: 41" 304 //Series: "" 305 series = ""; 306 //Volume: "" 307 volume = ""; 308 //Issue: "" 309 issue = ""; 310 //Page: "41" 311 page = splitCollation(collation)[2]; 312 //Tab/Fig/No: "" 313 tab_or_fig_or_no = ""; 314 //Year: "" 315 year = ""; 316 rule = "65.8"; 317 } 318 // 65.9 - see rule 3 319 320 if (pattern.equals(": a. d") && splitCollation(collation)[1].equals("t")){ 321 //Example: : t. 10 322 //Series: 323 series=""; 324 //Volume: 325 volume=""; 326 //Issue: 327 issue=""; 328 //Page: 329 page=""; 330 //Tab/Fig/Pl/Nos: t. 10 331 tab_or_fig_or_no="t. " + splitCollation(collation)[2]; 332 //Year: 333 year=""; 334 //Rule: 67 335 rule="67"; 336 } 337 if ((pattern.equals("d: a. d") || pattern.equals("d: a. yyyy") )&& splitCollation(collation)[1].equals("t")){ 338 //Example: 13: t. 1291 339 //Series: 340 series=""; 341 //Volume: 13 342 volume=splitCollation(collation)[0]; 343 //Issue: 344 issue=""; 345 //Page: 346 page=""; 347 //Tab/Fig/Pl/Nos: t. 1291 348 tab_or_fig_or_no="t. " + splitCollation(collation)[2]; 349 //Year: 350 year=""; 351 //Rule: 7 352 rule="7"; 353 } 354 if (pattern.equals("d(a. d): d")){ 355 //Example: 19(Suppl. 1): 131 356 //Series: 357 series=""; 358 //Volume: 19(Suppl. 1) 359 volume=collation.split(":")[0]; 360 //Issue: 361 issue=""; 362 //Page: 131 363 page=splitCollation(collation)[3]; 364 //Tab/Fig/Pl/Nos: 365 tab_or_fig_or_no=""; 366 //Year: 367 year=""; 368 //Rule: 65.13 369 rule="65.13"; 370 } 371 if (pattern.equals("d(d, a.): d")){ 372 //Example: 118(1291, Suppl.): 61 373 //Series: 374 series=""; 375 //Volume: 118 376 volume=splitCollation(collation)[0]; 377 //Issue: 1291, Suppl. 378 issue=splitCollation(collation)[1] + ", " + splitCollation(collation)[2] + "."; 379 //Page: 61 380 page=splitCollation(collation)[3]; 381 //Tab/Fig/Pl/Nos: 382 tab_or_fig_or_no=""; 383 //Year: 384 year=""; 385 //Rule: 65.14 386 rule="65.14"; 387 } 388 if (pattern.equals("d: d, d")){ 389 //Example: 2: 128, 331 390 //Series: 391 series=""; 392 //Volume: 2 393 volume=splitCollation(collation)[0]; 394 //Issue: 395 issue=""; 396 //Page: 128, 331 397 page=splitCollation(collation)[1] + ", " + splitCollation(collation)[2]; 398 //Tab/Fig/Pl/Nos: 399 tab_or_fig_or_no=""; 400 //Year: 401 year=""; 402 //Rule: 65.15 403 rule="65.15"; 404 } 405 if (pattern.equals("d(d) a: d") && splitCollation(collation)[2].equals("cppo")){ 406 //Example: 93(1097) cppo: 12 407 //Series: 408 series=""; 409 //Volume: 93 410 volume=splitCollation(collation)[0]; 411 //Issue: 1097 cppo 412 issue=splitCollation(collation)[1] + " " + splitCollation(collation)[2]; 413 //Page: 12 414 page=splitCollation(collation)[3]; 415 //Tab/Fig/Pl/Nos: 416 tab_or_fig_or_no=""; 417 //Year: 418 year=""; 419 //Rule: 65.17 420 rule="65.17"; 421 } 422 if (pattern.equals(", d(d): d")){ 423 //Example: , 54(5): 88 424 //Series: 425 series=""; 426 //Volume: 54 427 volume=splitCollation(collation)[1]; 428 //Issue: 5 429 issue=splitCollation(collation)[2]; 430 //Page: 88 431 page=splitCollation(collation)[3]; 432 //Tab/Fig/Pl/Nos: 433 tab_or_fig_or_no=""; 434 //Year: 435 year=""; 436 //Rule: 65.18 437 rule="65.18"; 438 } 439 if (pattern.equals(", r: d")){ 440 //Example: , C: 96 441 //Series: C 442 series=splitCollation(collation)[1]; 443 //Volume: 444 volume=""; 445 //Issue: 446 issue=""; 447 //Page: 96 448 page=splitCollation(collation)[2];; 449 //Tab/Fig/Pl/Nos: 450 tab_or_fig_or_no=""; 451 //Year: 452 year=""; 453 //Rule: 65.19 454 rule="65.19"; 455 } 456 if (pattern.equals("d(d a.): d")){ 457 //Example: 40(1 Anh.): 88 458 //Series: 459 series=""; 460 //Volume: 40 461 volume=splitCollation(collation)[0]; 462 //Issue: 1 Anh. 463 issue=splitCollation(collation)[1] + " " + splitCollation(collation)[2] + "."; 464 //Page: 88 465 page=splitCollation(collation)[3]; 466 //Tab/Fig/Pl/Nos: 467 tab_or_fig_or_no=""; 468 //Year: 469 year=""; 470 //Rule: 65.22 471 rule="65.22"; 472 } 473 if (pattern.equals(", a.a., a.a., d: d")){ 474 //Example: , n.s., f.m., 1: 107 475 //Series: n.s., f.m. 476 series=splitCollation(collation)[1]+"."+splitCollation(collation)[2]+"., "+splitCollation(collation)[3]+"."+splitCollation(collation)[4]+"."; 477 //Volume: 1 478 volume=splitCollation(collation)[5]; 479 //Issue: 480 issue=""; 481 //Page: 107 482 page=splitCollation(collation)[6]; 483 //Tab/Fig/Pl/Nos: 484 tab_or_fig_or_no=""; 485 //Year: 486 year=""; 487 //Rule: 65.23 488 rule="65.23"; 489 } 490 if (pattern.equals(", a d: d")){ 491 //Example: , Texte 3: 1149 492 //Series: 493 series=""; 494 //Volume: Texte 3 495 volume=splitCollation(collation)[1] + " " + splitCollation(collation)[2]; 496 //Issue: 497 issue=""; 498 //Page: 1149 499 page=splitCollation(collation)[3]; 500 //Tab/Fig/Pl/Nos: 501 tab_or_fig_or_no=""; 502 //Year: 503 year=""; 504 //Rule: 65.24 505 rule="65.24"; 506 } 507 if (pattern.equals("d: d, a. d")){ 508 //Example: 9: 388, t. 116 509 //Series: 510 series=""; 511 //Volume: 9 512 volume=splitCollation(collation)[0]; 513 //Issue: 514 issue=""; 515 //Page: 388 516 page=splitCollation(collation)[1]; 517 //Tab/Fig/Pl/Nos: t. 116 518 tab_or_fig_or_no=splitCollation(collation)[2]+". "+splitCollation(collation)[3]; 519 //Year: 520 year=""; 521 //Rule: 65.25 522 rule="65.25"; 523 } 524 if (pattern.equals(": d, d")){ 525 //Example: : 487, 701 526 //Series: 527 series=""; 528 //Volume: 529 volume=""; 530 //Issue: 531 issue=""; 532 //Page: 487, 701 533 page=splitCollation(collation)[1]+", "+splitCollation(collation)[2]; 534 //Tab/Fig/Pl/Nos: 535 tab_or_fig_or_no=""; 536 //Year: 537 year=""; 538 //Rule: 65.26 539 rule="65.26"; 540 } 541 if (pattern.equals("yyyy(a.): d")){ 542 //Example: 1857(App.): 4 543 //Series: 544 series=""; 545 //Volume: 1857(App.) 546 volume=collation.split(": ")[0]; 547 //Issue: 548 issue=""; 549 //Page: 4 550 page=collation.split(": ")[1]; 551 //Tab/Fig/Pl/Nos: 552 tab_or_fig_or_no=""; 553 //Year: 554 year=""; 555 //Rule: 65.29 556 rule="65.29"; 557 } 558 if (pattern.equals(", a. d, d(d): d")){ 559 if (splitCollation(collation)[1].equals("ed")){ 560 //Example: , ed. 3, 1(11-12): 677 561 //Series: ed. 3 562 series=splitCollation(collation)[1]+". "+splitCollation(collation)[2]; 563 //Volume: 1 564 volume=splitCollation(collation)[3]; 565 //Issue: 11-12 566 issue=splitCollation(collation)[4]; 567 //Page: 677 568 page=splitCollation(collation)[5]; 569 //Tab/Fig/Pl/Nos: 570 tab_or_fig_or_no=""; 571 //Year: 572 year=""; 573 //Rule: 65.30 574 rule="65.30"; 575 } 576 else{ 577 //Example: , ser. 3, 1(11-12): 677 578 //Series: 3 579 series=splitCollation(collation)[2]; 580 //Volume: 1 581 volume=splitCollation(collation)[3]; 582 //Issue: 11-12 583 issue=splitCollation(collation)[4]; 584 //Page: 677 585 page=splitCollation(collation)[5]; 586 //Tab/Fig/Pl/Nos: 587 tab_or_fig_or_no=""; 588 //Year: 589 year=""; 590 //Rule: 65.30 591 rule="65.30a"; 592 } 593 } 594 if (pattern.equals("yyyy-d: d")){ 595 //Example: 2012-39: 35 596 //Series: 597 series=""; 598 //Volume: 2012-39 599 volume=collation.split(": ")[0]; 600 //Issue: 601 issue=""; 602 //Page: 35 603 page=collation.split(": ")[1]; 604 //Tab/Fig/Pl/Nos: 605 tab_or_fig_or_no=""; 606 //Year: 607 year=""; 608 //Rule: 65.31 609 rule="65.31"; 610 } 611 if (pattern.equals(", a. a.: d") && (splitCollation(collation)[1]+". "+splitCollation(collation)[2]).equals("Spec. No")){ 612 //Example: , Spec. No.: 566 613 //Series: 614 series=""; 615 //Volume: 616 volume=""; 617 //Issue: 618 issue=""; 619 //Page: 620 page=""; 621 //Tab/Fig/Pl/Nos: Spec. No. 566 622 tab_or_fig_or_no="Spec. No. " + splitCollation(collation)[3]; 623 //Year: 624 year=""; 625 //Rule: 65.32 626 rule="65.32"; 627 } 628 if (pattern.equals(", a. a.: d") && !(splitCollation(collation)[1]+". "+splitCollation(collation)[2]).equals("Spec. No")){ 629 //Example: , ed. rev.: 1439 630 //Series: ed. rev. 631 series=splitCollation(collation)[1]+". "+splitCollation(collation)[2]+"."; 632 //Volume: 633 volume=""; 634 //Issue: 635 issue=""; 636 //Page: 1439 637 page=splitCollation(collation)[3]; 638 //Tab/Fig/Pl/Nos: 639 tab_or_fig_or_no=""; 640 //Year: 641 year=""; 642 //Rule: 65.33 643 rule="65.33"; 644 } 645 if (pattern.equals(", r, d r a d: d")){ 646 //Example: , IV, 50 II B 21: 242 647 //Series: 4 648 series=splitCollation(collation)[1]; 649 //Volume: 50 II B 21 650 volume=splitCollation(collation)[2]+" "+splitCollation(collation)[3]+" "+splitCollation(collation)[4]+" "+splitCollation(collation)[5]; 651 //Issue: 652 issue=""; 653 //Page: 242 654 page=splitCollation(collation)[6]; 655 //Tab/Fig/Pl/Nos: 656 tab_or_fig_or_no=""; 657 //Year: 658 year=""; 659 //Rule: 65.34 660 rule="65.34"; 661 } 662 if (pattern.equals(", a. d(d): d")){ 663 //Example: , Suppl. 115(1276): 18 664 //Series: 665 series=""; 666 //Volume: Suppl. 115 667 volume=splitCollation(collation)[1]+". "+splitCollation(collation)[2]; 668 //Issue: 1276 669 issue=splitCollation(collation)[3]; 670 //Page: 18 671 page=splitCollation(collation)[4]; 672 //Tab/Fig/Pl/Nos: 673 tab_or_fig_or_no=""; 674 //Year: 675 year=""; 676 //Rule: 65.35 677 rule="65.35"; 678 } 679 if (pattern.equals(", a.a., yyyy(d): d")){ 680 //Example: , n.s., 1883(2): 328 681 //Series: n.s. 682 series=splitCollation(collation)[1]+"."+splitCollation(collation)[2]+"."; 683 //Volume: 1883 684 volume=splitCollation(collation)[3]; 685 //Issue: 2 686 issue=splitCollation(collation)[4]; 687 //Page: 328 688 page=splitCollation(collation)[5]; 689 //Tab/Fig/Pl/Nos: 690 tab_or_fig_or_no=""; 691 //Year: 692 year=""; 693 //Rule: 65.36 694 rule="65.36"; 695 } 696 if (pattern.equals(", r, d: a. d")){ 697 //Example: , V, 7: t. 44 698 //Series: 5 699 series=splitCollation(collation)[1]; 700 //Volume: 7 701 volume=splitCollation(collation)[2]; 702 //Issue: 703 issue=""; 704 //Page: 705 page=""; 706 //Tab/Fig/Pl/Nos: t. 44 707 tab_or_fig_or_no=splitCollation(collation)[3]+". "+splitCollation(collation)[4]; 708 //Year: 709 year=""; 710 //Rule: 70 711 rule="70"; 712 } 713 if (pattern.equals("d(d, a.): d")){ 714 //Example: 120(1299, Suppl.): 49 715 //Series: 716 series=""; 717 //Volume: 120 718 volume=splitCollation(collation)[0]; 719 //Issue: 1299, Suppl. 720 issue=splitCollation(collation)[1]+", "+splitCollation(collation)[2]+"."; 721 //Page: 49 722 page=splitCollation(collation)[3]; 723 //Tab/Fig/Pl/Nos: 724 tab_or_fig_or_no=""; 725 //Year: 726 year=""; 727 //Rule: 65.37 728 rule="65.37"; 729 } 730 if (pattern.equals("yyyy-yyyy: d")){ 731 //Example: 1927-1929: 9 732 //Series: 733 series=""; 734 //Volume: 1927-1929 735 volume=collation.split(": ")[0]; 736 //Issue: 737 issue=""; 738 //Page: 9 739 page=collation.split(": ")[1]; 740 //Tab/Fig/Pl/Nos: 741 tab_or_fig_or_no=""; 742 //Year: 743 year=""; 744 //Rule: 65.38 745 rule="65.38"; 746 } 747 if (pattern.equals("d(d; d): d")){ 748 //Example: 24(3; 10): 23 749 //Series: 750 series=""; 751 //Volume: 24 752 volume=splitCollation(collation)[0]; 753 //Issue: 3; 10 754 issue=splitCollation(collation)[1]+"; "+splitCollation(collation)[2]; 755 //Page: 23 756 page=splitCollation(collation)[3]; 757 //Tab/Fig/Pl/Nos: 758 tab_or_fig_or_no=""; 759 //Year: 760 year=""; 761 //Rule: 65.39 762 rule="65.39"; 763 } 764 if (pattern.equals("d(a. a.): d")){ 765 //Example: 9(Suppl. Bot.): 33 766 //Series: 767 series=""; 768 //Volume: 9(Suppl. Bot.) 769 volume=collation.split(": ")[0]; 770 //Issue: 771 issue=""; 772 //Page: 33 773 page=collation.split(": ")[1]; 774 //Tab/Fig/Pl/Nos: 775 tab_or_fig_or_no=""; 776 //Year: 777 year=""; 778 //Rule: 65.40 779 rule="65.40"; 780 } 781 if (pattern.equals("d: a.\u00ba d") 782 || pattern.equals("d: a.\u00ba da") 783 || pattern.equals("d: a.\u00ba yyyy") 784 ||pattern.equals("yyyy: a.\u00ba d") 785 ||pattern.equals("yyyy: a.\u00ba da") 786 ||pattern.equals("yyyy: a.\u00ba yyyy")){ 787 //Example: 17: n.o 4 788 //Series: 789 series=""; 790 //Volume: 17 791 volume=splitCollation(collation)[0]; 792 //Issue: 793 issue=""; 794 //Page: 795 page=""; 796 //Tab/Fig/Pl/Nos: n.o 4 797 tab_or_fig_or_no="n.\u00ba " + splitCollation(collation)[2]; 798 //Year: 799 year=""; 800 //Rule: 73 801 rule="73"; 802 } 803 if (pattern.equals(": a.\u00ba d") 804 || pattern.equals(": a.\u00ba yyyy") 805 || pattern.equals(": a.\u00ba da") 806 || pattern.equals(": a.\u00ba yyyya")){ 807 //Example: ": n.o 4" 808 //Series: 809 series=""; 810 //Volume: 811 volume=""; 812 //Issue: 813 issue=""; 814 //Page: 815 page=""; 816 //Tab/Fig/Pl/Nos: n.o 4 817 tab_or_fig_or_no="n.\u00ba " + splitCollation(collation)[2]; 818 //Year: 819 year=""; 820 //Rule: 73 821 rule="73.1"; 822 } 823 824 825 if (pattern.equals(", r, yyyy(d): d")){ 826 //Example: , III, 1893(1): 413 827 //Series: 3 828 series=splitCollation(collation)[1]; 829 //Volume: 1893 830 volume=splitCollation(collation)[2]; 831 //Issue: 1 832 issue=splitCollation(collation)[3]; 833 //Page: 413 834 page=splitCollation(collation)[4]; 835 //Tab/Fig/Pl/Nos: 836 tab_or_fig_or_no=""; 837 //Year: 838 year=""; 839 //Rule: 65.41 840 rule="65.41"; 841 } 842 if (pattern.equals("d: a.a.")){ 843 //Example: 339: s.p. 844 //Series: 845 series=""; 846 //Volume: 339 847 volume=splitCollation(collation)[0]; 848 //Issue: 849 issue=""; 850 //Page: s.p. 851 page=splitCollation(collation)[1]+"."+splitCollation(collation)[2]+"."; 852 //Tab/Fig/Pl/Nos: 853 tab_or_fig_or_no=""; 854 //Year: 855 year=""; 856 //Rule: 74 857 rule="74"; 858 } 859 if (pattern.equals("d: a. d, a. d")){ 860 //Example: 47: t. 2113, p. 5 861 //Series: 862 series=""; 863 //Volume: 47 864 volume=splitCollation(collation)[0]; 865 //Issue: 866 issue=""; 867 //Page: t. 2113, p. 5 868 // CHECK!! 869 page=collation.split(": ")[1]; 870 //Tab/Fig/Pl/Nos: 871 tab_or_fig_or_no=""; 872 //Year: 873 year=""; 874 //Rule: 75 875 rule="75"; 876 } 877 if (pattern.equals(", a. a. a.: d")){ 878 //Example: , Prodr. Fl. Cap.: 5 879 //Series: Prodr. Fl. Cap. 880 series=splitCollation(collation)[1]+". "+splitCollation(collation)[2]+". "+splitCollation(collation)[3]+"."; 881 //Volume: 882 volume=""; 883 //Issue: 884 issue=""; 885 //Page: 5 886 page=splitCollation(collation)[4]; 887 //Tab/Fig/Pl/Nos: 888 tab_or_fig_or_no=""; 889 //Year: 890 year=""; 891 //Rule: 65.42 892 rule="65.42"; 893 } 894 if (pattern.equals("d(d: d): d")){ 895 //Example: 9(225: 1): 267 896 //Series: 897 series=""; 898 //Volume: 9 899 volume=splitCollation(collation)[0]; 900 //Issue: 255:1 901 issue=splitCollation(collation)[1]+":"+splitCollation(collation)[2]; 902 //Page: 267 903 page=splitCollation(collation)[3]; 904 //Tab/Fig/Pl/Nos: 905 tab_or_fig_or_no=""; 906 //Year: 907 year=""; 908 //Rule: 65.44 909 rule="65.44"; 910 } 911 if (pattern.equals(", a: d")){ 912 //Example: , Atlas: 13 913 //Series: Atlas 914 series=splitCollation(collation)[1]; 915 //Volume: 916 volume=""; 917 //Issue: 918 issue=""; 919 //Page: 13 920 page=splitCollation(collation)[2]; 921 //Tab/Fig/Pl/Nos: 922 tab_or_fig_or_no=""; 923 //Year: 924 year=""; 925 //Rule: 65.45 926 rule="65.45"; 927 } 928 if (pattern.equals("d(d): a. d")){ 929 //Example: 1(2): t. 20 930 //Series: 931 series=""; 932 //Volume: 1 933 volume=splitCollation(collation)[0]; 934 //Issue: 2 935 issue=splitCollation(collation)[1]; 936 //Page: 937 page=""; 938 //Tab/Fig/Pl/Nos: t. 20 939 tab_or_fig_or_no=splitCollation(collation)[2]+". "+splitCollation(collation)[3]; 940 //Year: 941 year=""; 942 //Rule: 77 943 rule="77"; 944 } 945 if (pattern.equals(", a.a., yyyy: d")){ 946 //Example: , n.s., 1875: 162 947 //Series: n.s. 948 series=splitCollation(collation)[1]+"."+splitCollation(collation)[2]+"."; 949 //Volume: 1875 950 volume=splitCollation(collation)[3]; 951 //Issue: 952 issue=""; 953 //Page: 162 954 page=splitCollation(collation)[4]; 955 //Tab/Fig/Pl/Nos: 956 tab_or_fig_or_no=""; 957 //Year: 958 year=""; 959 //Rule: 65.46 960 rule="65.46"; 961 } 962 if (pattern.equals(", r, yyyy: d")){ 963 //Example: , III, 1893: 629 964 //Series: 3 965 series=splitCollation(collation)[1]; 966 //Volume: 1893 967 volume=splitCollation(collation)[2]; 968 //Issue: 969 issue=""; 970 //Page: 629 971 page=splitCollation(collation)[3]; 972 //Tab/Fig/Pl/Nos: 973 tab_or_fig_or_no=""; 974 //Year: 975 year=""; 976 //Rule: 65.47 977 rule="65.47"; 978 } 979 if (pattern.equals("d(a): d")){ 980 //Example: 15(Extra): 408 981 //Series: 982 series=""; 983 //Volume: 15(Extra) 984 volume=collation.split(": ")[0]; 985 //Issue: 986 issue=""; 987 //Page: 408 988 page=collation.split(": ")[1]; 989 //Tab/Fig/Pl/Nos: 990 tab_or_fig_or_no=""; 991 //Year: 992 year=""; 993 //Rule: 65.50 994 rule="65.50"; 995 } 996 if (pattern.equals(", r, d r: d")){ 997 //Example: , IV, 243 II: 95 998 //Series: 4 999 series=splitCollation(collation)[1]; 1000 //Volume: 243 II 1001 volume=splitCollation(collation)[2]+" "+splitCollation(collation)[3]; 1002 //Issue: 1003 issue=""; 1004 //Page: 95 1005 page=splitCollation(collation)[4]; 1006 //Tab/Fig/Pl/Nos: 1007 tab_or_fig_or_no=""; 1008 //Year: 1009 year=""; 1010 //Rule: 65.51 1011 rule="65.51"; 1012 } 1013 if (pattern.equals("d:")){ 1014 //Example: 7: 1015 //Series: 1016 series=""; 1017 //Volume: 7 1018 volume=splitCollation(collation)[0]; 1019 //Issue: 1020 issue=""; 1021 //Page: 1022 page=""; 1023 //Tab/Fig/Pl/Nos: 1024 tab_or_fig_or_no=""; 1025 //Year: 1026 year=""; 1027 //Rule: 65.53 1028 rule="65.53"; 1029 } 1030 1031 String [] values = {series, volume, issue, page, tab_or_fig_or_no, year, rule}; 1032 return values; 1033 } 1034 1035 public static void main(String[] args) { 1036 // Tab separated file of id and collation 1037 String inputfile = args[0]; 1038 String outputfile = args[1]; 1039 1040 try(BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile),"utf8")); 1041 FileWriter fw = new FileWriter(outputfile); 1042 BufferedWriter bw = new BufferedWriter(fw);) { 1043 int count = 0; 1044 1045 1046 String line = null; 1047 while ((line = br.readLine()) != null) { 1048 if ((count++ % 10000) == 0){ 1049 System.out.println(count); 1050 } 1051 String[] elems = line.split("\t"); 1052 String id = elems[0]; 1053 if (elems.length > 1){ 1054 String collation = elems[1]; 1055 1056 String structure = new CollationStructureTransformer().transform(collation); 1057 1058 String[] parsed = CollationUtils.parseCollation(collation); 1059 1060 bw.write(id 1061 + "\t" + collation 1062 + "\t" + structure 1063 + "\t" + parsed[SERIES_INDEX] 1064 + "\t" + parsed[VOL_INDEX] 1065 + "\t" + parsed[ISSUE_INDEX] 1066 + "\t" + parsed[PAGE_INDEX] 1067 + "\t" + parsed[TAB_OR_FIG_INDEX] 1068 + "\t" + parsed[YEAR_INDEX] 1069 + "\t" + CollationUtils.parsableCollation(collation) 1070 + "\t" + parsed[RULE_INDEX] 1071 + "\n"); 1072 } 1073 else{ 1074 bw.write(id + "\n"); 1075 } 1076 } 1077 bw.flush(); 1078 bw.close(); 1079 } 1080 catch(Exception e){ 1081 e.printStackTrace(); 1082 } 1083 } 1084 1085}