001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.csv; 019 020import static org.apache.commons.csv.Token.Type.TOKEN; 021 022import java.io.Closeable; 023import java.io.File; 024import java.io.FileInputStream; 025import java.io.IOException; 026import java.io.InputStream; 027import java.io.InputStreamReader; 028import java.io.Reader; 029import java.io.StringReader; 030import java.net.URL; 031import java.nio.charset.Charset; 032import java.nio.file.Files; 033import java.nio.file.Path; 034import java.util.ArrayList; 035import java.util.Arrays; 036import java.util.Collections; 037import java.util.Iterator; 038import java.util.LinkedHashMap; 039import java.util.List; 040import java.util.Map; 041import java.util.NoSuchElementException; 042import java.util.TreeMap; 043 044/** 045 * Parses CSV files according to the specified format. 046 * 047 * Because CSV appears in many different dialects, the parser supports many formats by allowing the 048 * specification of a {@link CSVFormat}. 049 * 050 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream. 051 * 052 * <h2>Creating instances</h2> 053 * <p> 054 * There are several static factory methods that can be used to create instances for various types of resources: 055 * </p> 056 * <ul> 057 * <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li> 058 * <li>{@link #parse(String, CSVFormat)}</li> 059 * <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li> 060 * </ul> 061 * <p> 062 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor. 063 * 064 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut: 065 * </p> 066 * <pre> 067 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) { 068 * ... 069 * } 070 * </pre> 071 * 072 * <h2>Parsing record wise</h2> 073 * <p> 074 * To parse a CSV input from a file, you write: 075 * </p> 076 * 077 * <pre> 078 * File csvData = new File("/path/to/csv"); 079 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180); 080 * for (CSVRecord csvRecord : parser) { 081 * ... 082 * } 083 * </pre> 084 * 085 * <p> 086 * This will read the parse the contents of the file using the 087 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format. 088 * </p> 089 * 090 * <p> 091 * To parse CSV input in a format like Excel, you write: 092 * </p> 093 * 094 * <pre> 095 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL); 096 * for (CSVRecord csvRecord : parser) { 097 * ... 098 * } 099 * </pre> 100 * 101 * <p> 102 * If the predefined formats don't match the format at hands, custom formats can be defined. More information about 103 * customising CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}. 104 * </p> 105 * 106 * <h2>Parsing into memory</h2> 107 * <p> 108 * If parsing record wise is not desired, the contents of the input can be read completely into memory. 109 * </p> 110 * 111 * <pre> 112 * Reader in = new StringReader("a;b\nc;d"); 113 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL); 114 * List<CSVRecord> list = parser.getRecords(); 115 * </pre> 116 * 117 * <p> 118 * There are two constraints that have to be kept in mind: 119 * </p> 120 * 121 * <ol> 122 * <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from 123 * the input, those records will not end up in the in memory representation of your CSV data.</li> 124 * <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're 125 * parsing a 150MB file of CSV data the contents will be read completely into memory.</li> 126 * </ol> 127 * 128 * <h2>Notes</h2> 129 * <p> 130 * Internal parser state is completely covered by the format and the reader-state. 131 * </p> 132 * 133 * @see <a href="package-summary.html">package documentation for more details</a> 134 */ 135public final class CSVParser implements Iterable<CSVRecord>, Closeable { 136 137 class CSVRecordIterator implements Iterator<CSVRecord> { 138 private CSVRecord current; 139 140 private CSVRecord getNextRecord() { 141 try { 142 return CSVParser.this.nextRecord(); 143 } catch (final IOException e) { 144 throw new IllegalStateException( 145 e.getClass().getSimpleName() + " reading next record: " + e.toString(), e); 146 } 147 } 148 149 @Override 150 public boolean hasNext() { 151 if (CSVParser.this.isClosed()) { 152 return false; 153 } 154 if (this.current == null) { 155 this.current = this.getNextRecord(); 156 } 157 158 return this.current != null; 159 } 160 161 @Override 162 public CSVRecord next() { 163 if (CSVParser.this.isClosed()) { 164 throw new NoSuchElementException("CSVParser has been closed"); 165 } 166 CSVRecord next = this.current; 167 this.current = null; 168 169 if (next == null) { 170 // hasNext() wasn't called before 171 next = this.getNextRecord(); 172 if (next == null) { 173 throw new NoSuchElementException("No more CSV records available"); 174 } 175 } 176 177 return next; 178 } 179 180 @Override 181 public void remove() { 182 throw new UnsupportedOperationException(); 183 } 184 } 185 186 /** 187 * Creates a parser for the given {@link File}. 188 * 189 * @param file 190 * a CSV file. Must not be null. 191 * @param charset 192 * A Charset 193 * @param format 194 * the CSVFormat used for CSV parsing. Must not be null. 195 * @return a new parser 196 * @throws IllegalArgumentException 197 * If the parameters of the format are inconsistent or if either file or format are null. 198 * @throws IOException 199 * If an I/O error occurs 200 */ 201 @SuppressWarnings("resource") 202 public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException { 203 Assertions.notNull(file, "file"); 204 Assertions.notNull(format, "format"); 205 return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format); 206 } 207 208 /** 209 * Creates a CSV parser using the given {@link CSVFormat}. 210 * 211 * <p> 212 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 213 * unless you close the {@code reader}. 214 * </p> 215 * 216 * @param inputStream 217 * an InputStream containing CSV-formatted input. Must not be null. 218 * @param charset 219 * a Charset. 220 * @param format 221 * the CSVFormat used for CSV parsing. Must not be null. 222 * @return a new CSVParser configured with the given reader and format. 223 * @throws IllegalArgumentException 224 * If the parameters of the format are inconsistent or if either reader or format are null. 225 * @throws IOException 226 * If there is a problem reading the header or skipping the first record 227 * @since 1.5 228 */ 229 @SuppressWarnings("resource") 230 public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format) 231 throws IOException { 232 Assertions.notNull(inputStream, "inputStream"); 233 Assertions.notNull(format, "format"); 234 return parse(new InputStreamReader(inputStream, charset), format); 235 } 236 237 /** 238 * Creates a parser for the given {@link Path}. 239 * 240 * @param path 241 * a CSV file. Must not be null. 242 * @param charset 243 * A Charset 244 * @param format 245 * the CSVFormat used for CSV parsing. Must not be null. 246 * @return a new parser 247 * @throws IllegalArgumentException 248 * If the parameters of the format are inconsistent or if either file or format are null. 249 * @throws IOException 250 * If an I/O error occurs 251 * @since 1.5 252 */ 253 public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException { 254 Assertions.notNull(path, "path"); 255 Assertions.notNull(format, "format"); 256 return parse(Files.newInputStream(path), charset, format); 257 } 258 259 /** 260 * Creates a CSV parser using the given {@link CSVFormat} 261 * 262 * <p> 263 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 264 * unless you close the {@code reader}. 265 * </p> 266 * 267 * @param reader 268 * a Reader containing CSV-formatted input. Must not be null. 269 * @param format 270 * the CSVFormat used for CSV parsing. Must not be null. 271 * @return a new CSVParser configured with the given reader and format. 272 * @throws IllegalArgumentException 273 * If the parameters of the format are inconsistent or if either reader or format are null. 274 * @throws IOException 275 * If there is a problem reading the header or skipping the first record 276 * @since 1.5 277 */ 278 public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException { 279 return new CSVParser(reader, format); 280 } 281 282 /** 283 * Creates a parser for the given {@link String}. 284 * 285 * @param string 286 * a CSV string. Must not be null. 287 * @param format 288 * the CSVFormat used for CSV parsing. Must not be null. 289 * @return a new parser 290 * @throws IllegalArgumentException 291 * If the parameters of the format are inconsistent or if either string or format are null. 292 * @throws IOException 293 * If an I/O error occurs 294 */ 295 public static CSVParser parse(final String string, final CSVFormat format) throws IOException { 296 Assertions.notNull(string, "string"); 297 Assertions.notNull(format, "format"); 298 299 return new CSVParser(new StringReader(string), format); 300 } 301 302 // the following objects are shared to reduce garbage 303 304 /** 305 * Creates a parser for the given URL. 306 * 307 * <p> 308 * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless 309 * you close the {@code url}. 310 * </p> 311 * 312 * @param url 313 * a URL. Must not be null. 314 * @param charset 315 * the charset for the resource. Must not be null. 316 * @param format 317 * the CSVFormat used for CSV parsing. Must not be null. 318 * @return a new parser 319 * @throws IllegalArgumentException 320 * If the parameters of the format are inconsistent or if either url, charset or format are null. 321 * @throws IOException 322 * If an I/O error occurs 323 */ 324 public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException { 325 Assertions.notNull(url, "url"); 326 Assertions.notNull(charset, "charset"); 327 Assertions.notNull(format, "format"); 328 329 return new CSVParser(new InputStreamReader(url.openStream(), charset), format); 330 } 331 332 private final CSVFormat format; 333 334 /** A mapping of column names to column indices */ 335 private final Map<String, Integer> headerMap; 336 337 /** The column order to avoid re-computing it. */ 338 private final List<String> headerNames; 339 340 private final Lexer lexer; 341 342 private final CSVRecordIterator csvRecordIterator; 343 344 /** A record buffer for getRecord(). Grows as necessary and is reused. */ 345 private final List<String> recordList = new ArrayList<>(); 346 347 /** 348 * The next record number to assign. 349 */ 350 private long recordNumber; 351 352 /** 353 * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination 354 * with {@link #recordNumber}. 355 */ 356 private final long characterOffset; 357 358 private final Token reusableToken = new Token(); 359 360 /** 361 * Customized CSV parser using the given {@link CSVFormat} 362 * 363 * <p> 364 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 365 * unless you close the {@code reader}. 366 * </p> 367 * 368 * @param reader 369 * a Reader containing CSV-formatted input. Must not be null. 370 * @param format 371 * the CSVFormat used for CSV parsing. Must not be null. 372 * @throws IllegalArgumentException 373 * If the parameters of the format are inconsistent or if either reader or format are null. 374 * @throws IOException 375 * If there is a problem reading the header or skipping the first record 376 */ 377 public CSVParser(final Reader reader, final CSVFormat format) throws IOException { 378 this(reader, format, 0, 1); 379 } 380 381 /** 382 * Customized CSV parser using the given {@link CSVFormat} 383 * 384 * <p> 385 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 386 * unless you close the {@code reader}. 387 * </p> 388 * 389 * @param reader 390 * a Reader containing CSV-formatted input. Must not be null. 391 * @param format 392 * the CSVFormat used for CSV parsing. Must not be null. 393 * @param characterOffset 394 * Lexer offset when the parser does not start parsing at the beginning of the source. 395 * @param recordNumber 396 * The next record number to assign 397 * @throws IllegalArgumentException 398 * If the parameters of the format are inconsistent or if either reader or format are null. 399 * @throws IOException 400 * If there is a problem reading the header or skipping the first record 401 * @since 1.1 402 */ 403 @SuppressWarnings("resource") 404 public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) 405 throws IOException { 406 Assertions.notNull(reader, "reader"); 407 Assertions.notNull(format, "format"); 408 409 this.format = format; 410 this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); 411 this.csvRecordIterator = new CSVRecordIterator(); 412 final Headers headers = createHeaders(); 413 this.headerMap = headers.headerMap; 414 this.headerNames = headers.headerNames; 415 this.characterOffset = characterOffset; 416 this.recordNumber = recordNumber - 1; 417 } 418 419 private void addRecordValue(final boolean lastRecord) { 420 final String input = this.reusableToken.content.toString(); 421 final String inputClean = this.format.getTrim() ? input.trim() : input; 422 if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) { 423 return; 424 } 425 final String nullString = this.format.getNullString(); 426 this.recordList.add(inputClean.equals(nullString) ? null : inputClean); 427 } 428 429 /** 430 * Closes resources. 431 * 432 * @throws IOException 433 * If an I/O error occurs 434 */ 435 @Override 436 public void close() throws IOException { 437 if (this.lexer != null) { 438 this.lexer.close(); 439 } 440 } 441 442 private Map<String, Integer> createEmptyHeaderMap() { 443 return this.format.getIgnoreHeaderCase() ? 444 new TreeMap<>(String.CASE_INSENSITIVE_ORDER) : 445 new LinkedHashMap<>(); 446 } 447 448 /** 449 * Header information based on name and position. 450 */ 451 private static final class Headers { 452 /** 453 * Header column positions (0-based) 454 */ 455 final Map<String, Integer> headerMap; 456 457 /** 458 * Header names in column order 459 */ 460 final List<String> headerNames; 461 462 Headers(final Map<String, Integer> headerMap, final List<String> headerNames) { 463 this.headerMap = headerMap; 464 this.headerNames = headerNames; 465 } 466 } 467 468 /** 469 * Creates the name to index mapping if the format defines a header. 470 * 471 * @return null if the format has no header. 472 * @throws IOException if there is a problem reading the header or skipping the first record 473 */ 474 private Headers createHeaders() throws IOException { 475 Map<String, Integer> hdrMap = null; 476 List<String> headerNames = null; 477 final String[] formatHeader = this.format.getHeader(); 478 if (formatHeader != null) { 479 hdrMap = createEmptyHeaderMap(); 480 String[] headerRecord = null; 481 if (formatHeader.length == 0) { 482 // read the header from the first line of the file 483 final CSVRecord nextRecord = this.nextRecord(); 484 if (nextRecord != null) { 485 headerRecord = nextRecord.values(); 486 } 487 } else { 488 if (this.format.getSkipHeaderRecord()) { 489 this.nextRecord(); 490 } 491 headerRecord = formatHeader; 492 } 493 494 // build the name to index mappings 495 if (headerRecord != null) { 496 for (int i = 0; i < headerRecord.length; i++) { 497 final String header = headerRecord[i]; 498 final boolean emptyHeader = header == null || header.trim().isEmpty(); 499 if (emptyHeader && !this.format.getAllowMissingColumnNames()) { 500 throw new IllegalArgumentException( 501 "A header name is missing in " + Arrays.toString(headerRecord)); 502 } 503 // Note: This will always allow a duplicate header if the header is empty 504 final boolean containsHeader = header != null && hdrMap.containsKey(header); 505 if (containsHeader && !emptyHeader && !this.format.getAllowDuplicateHeaderNames()) { 506 throw new IllegalArgumentException( 507 String.format( 508 "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().", 509 header, Arrays.toString(headerRecord))); 510 } 511 if (header != null) { 512 hdrMap.put(header, Integer.valueOf(i)); 513 if (headerNames == null) { 514 headerNames = new ArrayList<>(headerRecord.length); 515 } 516 headerNames.add(header); 517 } 518 } 519 } 520 } 521 if (headerNames == null) { 522 headerNames = Collections.emptyList(); //immutable 523 } else { 524 headerNames = Collections.unmodifiableList(headerNames); 525 } 526 return new Headers(hdrMap, headerNames); 527 } 528 529 /** 530 * Returns the current line number in the input stream. 531 * 532 * <p> 533 * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to 534 * the record number. 535 * </p> 536 * 537 * @return current line number 538 */ 539 public long getCurrentLineNumber() { 540 return this.lexer.getCurrentLineNumber(); 541 } 542 543 /** 544 * Gets the first end-of-line string encountered. 545 * 546 * @return the first end-of-line string 547 * @since 1.5 548 */ 549 public String getFirstEndOfLine() { 550 return lexer.getFirstEol(); 551 } 552 553 /** 554 * Returns a copy of the header map. 555 * <p> 556 * The map keys are column names. The map values are 0-based indices. 557 * </p> 558 * <p> 559 * Note: The map can only provide a one-to-one mapping when the format did not 560 * contain null or duplicate column names. 561 * </p> 562 * 563 * @return a copy of the header map. 564 */ 565 public Map<String, Integer> getHeaderMap() { 566 if (this.headerMap == null) { 567 return null; 568 } 569 final Map<String, Integer> map = createEmptyHeaderMap(); 570 map.putAll(this.headerMap); 571 return map; 572 } 573 574 /** 575 * Returns the header map. 576 * 577 * @return the header map. 578 */ 579 Map<String, Integer> getHeaderMapRaw() { 580 return this.headerMap; 581 } 582 583 /** 584 * Returns a read-only list of header names that iterates in column order. 585 * <p> 586 * Note: The list provides strings that can be used as keys in the header map. 587 * The list will not contain null column names if they were present in the input 588 * format. 589 * </p> 590 * 591 * @return read-only list of header names that iterates in column order. 592 * @see #getHeaderMap() 593 * @since 1.7 594 */ 595 public List<String> getHeaderNames() { 596 return headerNames; 597 } 598 599 /** 600 * Returns the current record number in the input stream. 601 * 602 * <p> 603 * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to 604 * the line number. 605 * </p> 606 * 607 * @return current record number 608 */ 609 public long getRecordNumber() { 610 return this.recordNumber; 611 } 612 613 /** 614 * Parses the CSV input according to the given format and returns the content as a list of 615 * {@link CSVRecord CSVRecords}. 616 * 617 * <p> 618 * The returned content starts at the current parse-position in the stream. 619 * </p> 620 * 621 * @return list of {@link CSVRecord CSVRecords}, may be empty 622 * @throws IOException 623 * on parse error or input read-failure 624 */ 625 public List<CSVRecord> getRecords() throws IOException { 626 CSVRecord rec; 627 final List<CSVRecord> records = new ArrayList<>(); 628 while ((rec = this.nextRecord()) != null) { 629 records.add(rec); 630 } 631 return records; 632 } 633 634 /** 635 * Gets whether this parser is closed. 636 * 637 * @return whether this parser is closed. 638 */ 639 public boolean isClosed() { 640 return this.lexer.isClosed(); 641 } 642 643 /** 644 * Returns an iterator on the records. 645 * 646 * <p> 647 * An {@link IOException} caught during the iteration are re-thrown as an 648 * {@link IllegalStateException}. 649 * </p> 650 * <p> 651 * If the parser is closed a call to {@link Iterator#next()} will throw a 652 * {@link NoSuchElementException}. 653 * </p> 654 */ 655 @Override 656 public Iterator<CSVRecord> iterator() { 657 return csvRecordIterator; 658 } 659 660 /** 661 * Parses the next record from the current point in the stream. 662 * 663 * @return the record as an array of values, or {@code null} if the end of the stream has been reached 664 * @throws IOException 665 * on parse error or input read-failure 666 */ 667 CSVRecord nextRecord() throws IOException { 668 CSVRecord result = null; 669 this.recordList.clear(); 670 StringBuilder sb = null; 671 final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset; 672 do { 673 this.reusableToken.reset(); 674 this.lexer.nextToken(this.reusableToken); 675 switch (this.reusableToken.type) { 676 case TOKEN: 677 this.addRecordValue(false); 678 break; 679 case EORECORD: 680 this.addRecordValue(true); 681 break; 682 case EOF: 683 if (this.reusableToken.isReady) { 684 this.addRecordValue(true); 685 } 686 break; 687 case INVALID: 688 throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence"); 689 case COMMENT: // Ignored currently 690 if (sb == null) { // first comment for this record 691 sb = new StringBuilder(); 692 } else { 693 sb.append(Constants.LF); 694 } 695 sb.append(this.reusableToken.content); 696 this.reusableToken.type = TOKEN; // Read another token 697 break; 698 default: 699 throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type); 700 } 701 } while (this.reusableToken.type == TOKEN); 702 703 if (!this.recordList.isEmpty()) { 704 this.recordNumber++; 705 final String comment = sb == null ? null : sb.toString(); 706 result = new CSVRecord(this, this.recordList.toArray(new String[this.recordList.size()]), 707 comment, this.recordNumber, startCharPosition); 708 } 709 return result; 710 } 711 712}