001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.csv;
019
020import static org.apache.commons.csv.Token.Type.TOKEN;
021
022import java.io.Closeable;
023import java.io.File;
024import java.io.FileInputStream;
025import java.io.IOException;
026import java.io.InputStream;
027import java.io.InputStreamReader;
028import java.io.Reader;
029import java.io.StringReader;
030import java.net.URL;
031import java.nio.charset.Charset;
032import java.nio.file.Files;
033import java.nio.file.Path;
034import java.util.ArrayList;
035import java.util.Arrays;
036import java.util.Collections;
037import java.util.Iterator;
038import java.util.LinkedHashMap;
039import java.util.List;
040import java.util.Map;
041import java.util.NoSuchElementException;
042import java.util.TreeMap;
043
044/**
045 * Parses CSV files according to the specified format.
046 *
047 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
048 * specification of a {@link CSVFormat}.
049 *
050 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
051 *
052 * <h2>Creating instances</h2>
053 * <p>
054 * There are several static factory methods that can be used to create instances for various types of resources:
055 * </p>
056 * <ul>
057 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
058 *     <li>{@link #parse(String, CSVFormat)}</li>
059 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
060 * </ul>
061 * <p>
 * Alternatively, parsers can also be created by passing a {@link Reader} directly to one of the constructors.
063 *
064 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
065 * </p>
066 * <pre>
067 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
068 *     ...
069 * }
070 * </pre>
071 *
072 * <h2>Parsing record wise</h2>
073 * <p>
074 * To parse a CSV input from a file, you write:
075 * </p>
076 *
077 * <pre>
078 * File csvData = new File(&quot;/path/to/csv&quot;);
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180);
080 * for (CSVRecord csvRecord : parser) {
081 *     ...
082 * }
083 * </pre>
084 *
085 * <p>
 * This will read and parse the contents of the file using the
087 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
088 * </p>
089 *
090 * <p>
091 * To parse CSV input in a format like Excel, you write:
092 * </p>
093 *
094 * <pre>
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL);
096 * for (CSVRecord csvRecord : parser) {
097 *     ...
098 * }
099 * </pre>
100 *
101 * <p>
 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
103 * customising CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
104 * </p>
105 *
106 * <h2>Parsing into memory</h2>
107 * <p>
108 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
109 * </p>
110 *
111 * <pre>
112 * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
113 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
114 * List&lt;CSVRecord&gt; list = parser.getRecords();
115 * </pre>
116 *
117 * <p>
118 * There are two constraints that have to be kept in mind:
119 * </p>
120 *
121 * <ol>
122 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
123 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
124 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
125 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
126 * </ol>
127 *
128 * <h2>Notes</h2>
129 * <p>
130 * Internal parser state is completely covered by the format and the reader-state.
131 * </p>
132 *
133 * @see <a href="package-summary.html">package documentation for more details</a>
134 */
135public final class CSVParser implements Iterable<CSVRecord>, Closeable {
136
137    class CSVRecordIterator implements Iterator<CSVRecord> {
138        private CSVRecord current;
139
140        private CSVRecord getNextRecord() {
141            try {
142                return CSVParser.this.nextRecord();
143            } catch (final IOException e) {
144                throw new IllegalStateException(
145                        e.getClass().getSimpleName() + " reading next record: " + e.toString(), e);
146            }
147        }
148
149        @Override
150        public boolean hasNext() {
151            if (CSVParser.this.isClosed()) {
152                return false;
153            }
154            if (this.current == null) {
155                this.current = this.getNextRecord();
156            }
157
158            return this.current != null;
159        }
160
161        @Override
162        public CSVRecord next() {
163            if (CSVParser.this.isClosed()) {
164                throw new NoSuchElementException("CSVParser has been closed");
165            }
166            CSVRecord next = this.current;
167            this.current = null;
168
169            if (next == null) {
170                // hasNext() wasn't called before
171                next = this.getNextRecord();
172                if (next == null) {
173                    throw new NoSuchElementException("No more CSV records available");
174                }
175            }
176
177            return next;
178        }
179
180        @Override
181        public void remove() {
182            throw new UnsupportedOperationException();
183        }
184    }
185
186    /**
187     * Creates a parser for the given {@link File}.
188     *
189     * @param file
190     *            a CSV file. Must not be null.
191     * @param charset
192     *            A Charset
193     * @param format
194     *            the CSVFormat used for CSV parsing. Must not be null.
195     * @return a new parser
196     * @throws IllegalArgumentException
197     *             If the parameters of the format are inconsistent or if either file or format are null.
198     * @throws IOException
199     *             If an I/O error occurs
200     */
201    @SuppressWarnings("resource")
202    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
203        Assertions.notNull(file, "file");
204        Assertions.notNull(format, "format");
205        return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
206    }
207
208    /**
209     * Creates a CSV parser using the given {@link CSVFormat}.
210     *
211     * <p>
212     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
213     * unless you close the {@code reader}.
214     * </p>
215     *
216     * @param inputStream
217     *            an InputStream containing CSV-formatted input. Must not be null.
218     * @param charset
219     *            a Charset.
220     * @param format
221     *            the CSVFormat used for CSV parsing. Must not be null.
222     * @return a new CSVParser configured with the given reader and format.
223     * @throws IllegalArgumentException
224     *             If the parameters of the format are inconsistent or if either reader or format are null.
225     * @throws IOException
226     *             If there is a problem reading the header or skipping the first record
227     * @since 1.5
228     */
229    @SuppressWarnings("resource")
230    public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
231            throws IOException {
232        Assertions.notNull(inputStream, "inputStream");
233        Assertions.notNull(format, "format");
234        return parse(new InputStreamReader(inputStream, charset), format);
235    }
236
237    /**
238     * Creates a parser for the given {@link Path}.
239     *
240     * @param path
241     *            a CSV file. Must not be null.
242     * @param charset
243     *            A Charset
244     * @param format
245     *            the CSVFormat used for CSV parsing. Must not be null.
246     * @return a new parser
247     * @throws IllegalArgumentException
248     *             If the parameters of the format are inconsistent or if either file or format are null.
249     * @throws IOException
250     *             If an I/O error occurs
251     * @since 1.5
252     */
253    public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
254        Assertions.notNull(path, "path");
255        Assertions.notNull(format, "format");
256        return parse(Files.newInputStream(path), charset, format);
257    }
258
259    /**
260     * Creates a CSV parser using the given {@link CSVFormat}
261     *
262     * <p>
263     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
264     * unless you close the {@code reader}.
265     * </p>
266     *
267     * @param reader
268     *            a Reader containing CSV-formatted input. Must not be null.
269     * @param format
270     *            the CSVFormat used for CSV parsing. Must not be null.
271     * @return a new CSVParser configured with the given reader and format.
272     * @throws IllegalArgumentException
273     *             If the parameters of the format are inconsistent or if either reader or format are null.
274     * @throws IOException
275     *             If there is a problem reading the header or skipping the first record
276     * @since 1.5
277     */
278    public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
279        return new CSVParser(reader, format);
280    }
281
282    /**
283     * Creates a parser for the given {@link String}.
284     *
285     * @param string
286     *            a CSV string. Must not be null.
287     * @param format
288     *            the CSVFormat used for CSV parsing. Must not be null.
289     * @return a new parser
290     * @throws IllegalArgumentException
291     *             If the parameters of the format are inconsistent or if either string or format are null.
292     * @throws IOException
293     *             If an I/O error occurs
294     */
295    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
296        Assertions.notNull(string, "string");
297        Assertions.notNull(format, "format");
298
299        return new CSVParser(new StringReader(string), format);
300    }
301
302    // the following objects are shared to reduce garbage
303
304    /**
305     * Creates a parser for the given URL.
306     *
307     * <p>
308     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
309     * you close the {@code url}.
310     * </p>
311     *
312     * @param url
313     *            a URL. Must not be null.
314     * @param charset
315     *            the charset for the resource. Must not be null.
316     * @param format
317     *            the CSVFormat used for CSV parsing. Must not be null.
318     * @return a new parser
319     * @throws IllegalArgumentException
320     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
321     * @throws IOException
322     *             If an I/O error occurs
323     */
324    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
325        Assertions.notNull(url, "url");
326        Assertions.notNull(charset, "charset");
327        Assertions.notNull(format, "format");
328
329        return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
330    }
331
332    private final CSVFormat format;
333
334    /** A mapping of column names to column indices */
335    private final Map<String, Integer> headerMap;
336
337    /** The column order to avoid re-computing it. */
338    private final List<String> headerNames;
339
340    private final Lexer lexer;
341
342    private final CSVRecordIterator csvRecordIterator;
343
344    /** A record buffer for getRecord(). Grows as necessary and is reused. */
345    private final List<String> recordList = new ArrayList<>();
346
347    /**
348     * The next record number to assign.
349     */
350    private long recordNumber;
351
352    /**
353     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
354     * with {@link #recordNumber}.
355     */
356    private final long characterOffset;
357
358    private final Token reusableToken = new Token();
359
360    /**
361     * Customized CSV parser using the given {@link CSVFormat}
362     *
363     * <p>
364     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
365     * unless you close the {@code reader}.
366     * </p>
367     *
368     * @param reader
369     *            a Reader containing CSV-formatted input. Must not be null.
370     * @param format
371     *            the CSVFormat used for CSV parsing. Must not be null.
372     * @throws IllegalArgumentException
373     *             If the parameters of the format are inconsistent or if either reader or format are null.
374     * @throws IOException
375     *             If there is a problem reading the header or skipping the first record
376     */
377    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
378        this(reader, format, 0, 1);
379    }
380
381    /**
382     * Customized CSV parser using the given {@link CSVFormat}
383     *
384     * <p>
385     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
386     * unless you close the {@code reader}.
387     * </p>
388     *
389     * @param reader
390     *            a Reader containing CSV-formatted input. Must not be null.
391     * @param format
392     *            the CSVFormat used for CSV parsing. Must not be null.
393     * @param characterOffset
394     *            Lexer offset when the parser does not start parsing at the beginning of the source.
395     * @param recordNumber
396     *            The next record number to assign
397     * @throws IllegalArgumentException
398     *             If the parameters of the format are inconsistent or if either reader or format are null.
399     * @throws IOException
400     *             If there is a problem reading the header or skipping the first record
401     * @since 1.1
402     */
403    @SuppressWarnings("resource")
404    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
405        throws IOException {
406        Assertions.notNull(reader, "reader");
407        Assertions.notNull(format, "format");
408
409        this.format = format;
410        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
411        this.csvRecordIterator = new CSVRecordIterator();
412        final Headers headers = createHeaders();
413        this.headerMap = headers.headerMap;
414        this.headerNames = headers.headerNames;
415        this.characterOffset = characterOffset;
416        this.recordNumber = recordNumber - 1;
417    }
418
419    private void addRecordValue(final boolean lastRecord) {
420        final String input = this.reusableToken.content.toString();
421        final String inputClean = this.format.getTrim() ? input.trim() : input;
422        if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) {
423            return;
424        }
425        final String nullString = this.format.getNullString();
426        this.recordList.add(inputClean.equals(nullString) ? null : inputClean);
427    }
428
429    /**
430     * Closes resources.
431     *
432     * @throws IOException
433     *             If an I/O error occurs
434     */
435    @Override
436    public void close() throws IOException {
437        if (this.lexer != null) {
438            this.lexer.close();
439        }
440    }
441
442    private Map<String, Integer> createEmptyHeaderMap() {
443        return this.format.getIgnoreHeaderCase() ?
444                new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
445                new LinkedHashMap<>();
446    }
447
448    /**
449     * Header information based on name and position.
450     */
451    private static final class Headers {
452        /**
453         * Header column positions (0-based)
454         */
455        final Map<String, Integer> headerMap;
456
457        /**
458         * Header names in column order
459         */
460        final List<String> headerNames;
461
462        Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
463            this.headerMap = headerMap;
464            this.headerNames = headerNames;
465        }
466    }
467
468    /**
469     * Creates the name to index mapping if the format defines a header.
470     *
471     * @return null if the format has no header.
472     * @throws IOException if there is a problem reading the header or skipping the first record
473     */
474    private Headers createHeaders() throws IOException {
475        Map<String, Integer> hdrMap = null;
476        List<String> headerNames = null;
477        final String[] formatHeader = this.format.getHeader();
478        if (formatHeader != null) {
479            hdrMap = createEmptyHeaderMap();
480            String[] headerRecord = null;
481            if (formatHeader.length == 0) {
482                // read the header from the first line of the file
483                final CSVRecord nextRecord = this.nextRecord();
484                if (nextRecord != null) {
485                    headerRecord = nextRecord.values();
486                }
487            } else {
488                if (this.format.getSkipHeaderRecord()) {
489                    this.nextRecord();
490                }
491                headerRecord = formatHeader;
492            }
493
494            // build the name to index mappings
495            if (headerRecord != null) {
496                for (int i = 0; i < headerRecord.length; i++) {
497                    final String header = headerRecord[i];
498                    final boolean emptyHeader = header == null || header.trim().isEmpty();
499                    if (emptyHeader && !this.format.getAllowMissingColumnNames()) {
500                        throw new IllegalArgumentException(
501                            "A header name is missing in " + Arrays.toString(headerRecord));
502                    }
503                    // Note: This will always allow a duplicate header if the header is empty
504                    final boolean containsHeader = header != null && hdrMap.containsKey(header);
505                    if (containsHeader && !emptyHeader && !this.format.getAllowDuplicateHeaderNames()) {
506                        throw new IllegalArgumentException(
507                            String.format(
508                                "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().",
509                                header, Arrays.toString(headerRecord)));
510                    }
511                    if (header != null) {
512                        hdrMap.put(header, Integer.valueOf(i));
513                        if (headerNames == null) {
514                            headerNames = new ArrayList<>(headerRecord.length);
515                        }
516                        headerNames.add(header);
517                    }
518                }
519            }
520        }
521        if (headerNames == null) {
522            headerNames = Collections.emptyList(); //immutable
523        } else {
524            headerNames = Collections.unmodifiableList(headerNames);
525        }
526        return new Headers(hdrMap, headerNames);
527    }
528
529    /**
530     * Returns the current line number in the input stream.
531     *
532     * <p>
533     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
534     * the record number.
535     * </p>
536     *
537     * @return current line number
538     */
539    public long getCurrentLineNumber() {
540        return this.lexer.getCurrentLineNumber();
541    }
542
543    /**
544     * Gets the first end-of-line string encountered.
545     *
546     * @return the first end-of-line string
547     * @since 1.5
548     */
549    public String getFirstEndOfLine() {
550        return lexer.getFirstEol();
551    }
552
553    /**
554     * Returns a copy of the header map.
555     * <p>
556     * The map keys are column names. The map values are 0-based indices.
557     * </p>
558     * <p>
559     * Note: The map can only provide a one-to-one mapping when the format did not
560     * contain null or duplicate column names.
561     * </p>
562     *
563     * @return a copy of the header map.
564     */
565    public Map<String, Integer> getHeaderMap() {
566        if (this.headerMap == null) {
567            return null;
568        }
569        final Map<String, Integer> map = createEmptyHeaderMap();
570        map.putAll(this.headerMap);
571        return map;
572    }
573
574    /**
575     * Returns the header map.
576     *
577     * @return the header map.
578     */
579    Map<String, Integer> getHeaderMapRaw() {
580        return this.headerMap;
581    }
582
583    /**
584     * Returns a read-only list of header names that iterates in column order.
585     * <p>
586     * Note: The list provides strings that can be used as keys in the header map.
587     * The list will not contain null column names if they were present in the input
588     * format.
589     * </p>
590     *
591     * @return read-only list of header names that iterates in column order.
592     * @see #getHeaderMap()
593     * @since 1.7
594     */
595    public List<String> getHeaderNames() {
596        return headerNames;
597    }
598
599    /**
600     * Returns the current record number in the input stream.
601     *
602     * <p>
603     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
604     * the line number.
605     * </p>
606     *
607     * @return current record number
608     */
609    public long getRecordNumber() {
610        return this.recordNumber;
611    }
612
613    /**
614     * Parses the CSV input according to the given format and returns the content as a list of
615     * {@link CSVRecord CSVRecords}.
616     *
617     * <p>
618     * The returned content starts at the current parse-position in the stream.
619     * </p>
620     *
621     * @return list of {@link CSVRecord CSVRecords}, may be empty
622     * @throws IOException
623     *             on parse error or input read-failure
624     */
625    public List<CSVRecord> getRecords() throws IOException {
626        CSVRecord rec;
627        final List<CSVRecord> records = new ArrayList<>();
628        while ((rec = this.nextRecord()) != null) {
629            records.add(rec);
630        }
631        return records;
632    }
633
634    /**
635     * Gets whether this parser is closed.
636     *
637     * @return whether this parser is closed.
638     */
639    public boolean isClosed() {
640        return this.lexer.isClosed();
641    }
642
643    /**
644     * Returns an iterator on the records.
645     *
646     * <p>
647     * An {@link IOException} caught during the iteration are re-thrown as an
648     * {@link IllegalStateException}.
649     * </p>
650     * <p>
651     * If the parser is closed a call to {@link Iterator#next()} will throw a
652     * {@link NoSuchElementException}.
653     * </p>
654     */
655    @Override
656    public Iterator<CSVRecord> iterator() {
657        return csvRecordIterator;
658    }
659
660    /**
661     * Parses the next record from the current point in the stream.
662     *
663     * @return the record as an array of values, or {@code null} if the end of the stream has been reached
664     * @throws IOException
665     *             on parse error or input read-failure
666     */
667    CSVRecord nextRecord() throws IOException {
668        CSVRecord result = null;
669        this.recordList.clear();
670        StringBuilder sb = null;
671        final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
672        do {
673            this.reusableToken.reset();
674            this.lexer.nextToken(this.reusableToken);
675            switch (this.reusableToken.type) {
676            case TOKEN:
677                this.addRecordValue(false);
678                break;
679            case EORECORD:
680                this.addRecordValue(true);
681                break;
682            case EOF:
683                if (this.reusableToken.isReady) {
684                    this.addRecordValue(true);
685                }
686                break;
687            case INVALID:
688                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
689            case COMMENT: // Ignored currently
690                if (sb == null) { // first comment for this record
691                    sb = new StringBuilder();
692                } else {
693                    sb.append(Constants.LF);
694                }
695                sb.append(this.reusableToken.content);
696                this.reusableToken.type = TOKEN; // Read another token
697                break;
698            default:
699                throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
700            }
701        } while (this.reusableToken.type == TOKEN);
702
703        if (!this.recordList.isEmpty()) {
704            this.recordNumber++;
705            final String comment = sb == null ? null : sb.toString();
706            result = new CSVRecord(this, this.recordList.toArray(new String[this.recordList.size()]),
707                comment, this.recordNumber, startCharPosition);
708        }
709        return result;
710    }
711
712}