METAMODEL-250: Added support for EBCDIC files
authorKasper Sørensen <i.am.kasper.sorensen@gmail.com>
Tue, 2 Aug 2016 04:19:22 +0000 (21:19 -0700)
committerKasper Sørensen <i.am.kasper.sorensen@gmail.com>
Tue, 2 Aug 2016 04:20:11 +0000 (21:20 -0700)
Closes #103

14 files changed:
CHANGES.md
fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicConfiguration.java [new file with mode: 0644]
fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicReader.java [new file with mode: 0644]
fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthColumnSpec.java
fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfiguration.java
fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationReader.java
fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataContext.java
fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthDataSet.java
fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/FixedWidthReader.java
fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/EBCDICTest.java [new file with mode: 0644]
fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthConfigurationTest.java
fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthDataContextTest.java
fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/FixedWidthReaderTest.java
fixedwidth/src/test/resources/fixed-width-2-7-10-10.ebc [new file with mode: 0644]

index f0264c6..c0b90cc 100644 (file)
@@ -2,6 +2,7 @@
 
  * [METAMODEL-1099] - Created a new DataContextFactory SPI and a extensible registry of implementations based on ServiceLoader.
  * [METAMODEL-1099] - Implemented DataContextFactory SPI for connectors: JDBC, CSV, ElasticSearch
+ * [METAMODEL-250] - Added support for EBCDIC files (part of 'fixedwidth' module).
  * [METAMODEL-1103] - Fixed a bug pertaining to anchoring of wildcards in LIKE operands.
  * [METAMODEL-1088] - Add support for aliases in MongoDB.
  * [METAMODEL-1086] - Fixed encoding issue when CsvDataContext is instantiated with InputStream.
diff --git a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicConfiguration.java b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicConfiguration.java
new file mode 100644 (file)
index 0000000..389a4f8
--- /dev/null
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.metamodel.fixedwidth;
+
+/**
+ * Special fixed-width configuration for EBCDIC files. 
+ */
+public final class EbcdicConfiguration extends FixedWidthConfiguration {
+
+    private final boolean _skipEbcdicHeader;
+    private final boolean _eolPresent;
+
+    public EbcdicConfiguration(int columnNameLineNumber, String encoding, int fixedValueWidth,
+            boolean failOnInconsistentLineWidth, boolean skipEbcdicHeader, boolean eolPresent) {
+        super(columnNameLineNumber, encoding, fixedValueWidth, failOnInconsistentLineWidth);
+        _skipEbcdicHeader = skipEbcdicHeader;
+        _eolPresent = eolPresent;
+    }
+
+    public EbcdicConfiguration(int columnNameLineNumber, String encoding, int[] valueWidths,
+            boolean failOnInconsistentLineWidth, boolean skipEbcdicHeader, boolean eolPresent) {
+        super(columnNameLineNumber, null, encoding, valueWidths, failOnInconsistentLineWidth);
+        _skipEbcdicHeader = skipEbcdicHeader;
+        _eolPresent = eolPresent;
+    }
+
+    /**
+     * Determines if the input file contains a header that should be skipped before reading records data.
+     *
+     * @return a boolean indicating whether or not to skip EBCDIC header.
+     */
+    public boolean isSkipEbcdicHeader() {
+        return _skipEbcdicHeader;
+    }
+
+    /**
+     * Determines if the input file contains new line characters.
+     *
+     * @return a boolean indicating whether or not the input contains new line characters.
+     */
+    public boolean isEolPresent() {
+        return _eolPresent;
+    }
+}
diff --git a/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicReader.java b/fixedwidth/src/main/java/org/apache/metamodel/fixedwidth/EbcdicReader.java
new file mode 100644 (file)
index 0000000..a7639fc
--- /dev/null
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.metamodel.fixedwidth;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+
+/**
+ * Reader capable of separating values based on a fixed width setting.
+ */
+class EbcdicReader extends FixedWidthReader {
+
+    private final boolean _skipEbcdicHeader;
+    private final boolean _eolPresent;
+    private boolean _headerSkipped;
+    
+    public EbcdicReader(BufferedInputStream stream, String charsetName, int[] valueWidths,
+            boolean failOnInconsistentLineWidth, boolean skipEbcdicHeader, boolean eolPresent) {
+        super(stream, charsetName, valueWidths, failOnInconsistentLineWidth);
+        _skipEbcdicHeader = skipEbcdicHeader;
+        _eolPresent = eolPresent;
+    }
+
+    @Override
+    protected void beforeReadLine() {
+        if (shouldSkipHeader()) {
+            try {
+                skipHeader();
+            } catch (IOException e) {
+                throw new IllegalStateException("A problem occurred while skipping the input stream. ", e); 
+            }
+        }
+    }
+
+    private boolean shouldSkipHeader() {
+        return (_skipEbcdicHeader && !_headerSkipped);
+    }
+
+    private void skipHeader() throws IOException {
+        _headerSkipped = true;
+        _stream.skip(_expectedLineLength);
+    }
+
+    @Override
+    protected String readSingleRecordData() throws IOException {
+        if (_eolPresent) {
+            return super.readSingleRecordData();
+        } else {
+            byte[] buffer = new byte[_expectedLineLength];
+            int bytesRead = _stream.read(buffer, 0, _expectedLineLength);
+
+            if (bytesRead < 0) {
+                return null;
+            }
+
+            return new String(buffer, _charsetName);
+        } 
+    }
+}
index 65ec219..dedfbcd 100644 (file)
@@ -24,7 +24,7 @@ import org.apache.metamodel.util.HasName;
  * Represents the specification of a single column for a\r
  * {@link FixedWidthDataContext}.\r
  */\r
-public final class FixedWidthColumnSpec implements HasName {\r
+final class FixedWidthColumnSpec implements HasName {\r
 \r
     private final String name;\r
     private final int width;\r
index 2b2cae5..c53ff16 100644 (file)
@@ -31,32 +31,29 @@ import org.apache.metamodel.util.FileHelper;
 import org.apache.metamodel.util.HasNameMapper;\r
 \r
 /**\r
- * Configuration of metadata about a fixed width values datacontext.\r
+ * Configuration of metadata about a fixed width values data context.\r
  */\r
-public final class FixedWidthConfiguration extends BaseObject implements\r
-               Serializable {\r
+public class FixedWidthConfiguration extends BaseObject implements Serializable {\r
 \r
-       private static final long serialVersionUID = 1L;\r
+    private static final long serialVersionUID = 1L;\r
 \r
-       public static final int NO_COLUMN_NAME_LINE = 0;\r
-       public static final int DEFAULT_COLUMN_NAME_LINE = 1;\r
+    public static final int NO_COLUMN_NAME_LINE = 0;\r
+    public static final int DEFAULT_COLUMN_NAME_LINE = 1;\r
 \r
-       private final String encoding;\r
-       private final int fixedValueWidth;\r
-       private final int[] valueWidths;\r
-       private final int columnNameLineNumber;\r
-       private final boolean failOnInconsistentLineWidth;\r
-       private final ColumnNamingStrategy columnNamingStrategy;\r
+    private final String encoding;\r
+    private final int fixedValueWidth;\r
+    private final int[] valueWidths;\r
+    private final int columnNameLineNumber;\r
+    private final boolean failOnInconsistentLineWidth;\r
+    private final ColumnNamingStrategy columnNamingStrategy;\r
 \r
-       public FixedWidthConfiguration(int fixedValueWidth) {\r
-               this(DEFAULT_COLUMN_NAME_LINE, FileHelper.DEFAULT_ENCODING,\r
-                               fixedValueWidth);\r
-       }\r
+    public FixedWidthConfiguration(int fixedValueWidth) {\r
+        this(DEFAULT_COLUMN_NAME_LINE, FileHelper.DEFAULT_ENCODING, fixedValueWidth);\r
+    }\r
 \r
-       public FixedWidthConfiguration(int[] valueWidth) {\r
-               this(DEFAULT_COLUMN_NAME_LINE, FileHelper.DEFAULT_ENCODING, valueWidth,\r
-                               false);\r
-       }\r
+    public FixedWidthConfiguration(int[] valueWidth) {\r
+        this(DEFAULT_COLUMN_NAME_LINE, FileHelper.DEFAULT_ENCODING, valueWidth, false);\r
+    }\r
 \r
     public FixedWidthConfiguration(int columnNameLineNumber, String encoding, int fixedValueWidth) {\r
         this(columnNameLineNumber, encoding, fixedValueWidth, false);\r
@@ -72,11 +69,11 @@ public final class FixedWidthConfiguration extends BaseObject implements
         this.valueWidths = new int[0];\r
     }\r
 \r
-    public FixedWidthConfiguration(int columnNameLineNumber, String encoding,\r
-            int[] valueWidths, boolean failOnInconsistentLineWidth) {\r
+    public FixedWidthConfiguration(int columnNameLineNumber, String encoding, int[] valueWidths, \r
+            boolean failOnInconsistentLineWidth) {\r
         this(columnNameLineNumber, null, encoding, valueWidths, failOnInconsistentLineWidth);\r
     }\r
-    \r
+\r
     public FixedWidthConfiguration(int columnNameLineNumber, ColumnNamingStrategy columnNamingStrategy, String encoding,\r
             int[] valueWidths, boolean failOnInconsistentLineWidth) {\r
         this.encoding = encoding;\r
@@ -86,7 +83,7 @@ public final class FixedWidthConfiguration extends BaseObject implements
         this.columnNamingStrategy = columnNamingStrategy;\r
         this.valueWidths = valueWidths;\r
     }\r
-    \r
+\r
     public FixedWidthConfiguration(String encoding, List<FixedWidthColumnSpec> columnSpecs) {\r
         this(encoding, columnSpecs, false);\r
     }\r
@@ -106,84 +103,84 @@ public final class FixedWidthConfiguration extends BaseObject implements
     }\r
 \r
     /**\r
-        * The line number (1 based) from which to get the names of the columns.\r
-        * \r
-        * @return an int representing the line number of the column headers/names.\r
-        */\r
-       public int getColumnNameLineNumber() {\r
-               return columnNameLineNumber;\r
-       }\r
-       \r
-       /**\r
-        * Gets a {@link ColumnNamingStrategy} to use if needed.\r
-        * @return\r
-        */\r
-       public ColumnNamingStrategy getColumnNamingStrategy() {\r
-           if (columnNamingStrategy == null) {\r
-               return ColumnNamingStrategies.defaultStrategy();\r
-           }\r
+     * The line number (1 based) from which to get the names of the columns.\r
+     *\r
+     * @return an int representing the line number of the column headers/names.\r
+     */\r
+    public int getColumnNameLineNumber() {\r
+        return columnNameLineNumber;\r
+    }\r
+\r
+    /**\r
+     * Gets a {@link ColumnNamingStrategy} to use if needed.\r
+     * @return column naming strategy\r
+     */\r
+    public ColumnNamingStrategy getColumnNamingStrategy() {\r
+        if (columnNamingStrategy == null) {\r
+            return ColumnNamingStrategies.defaultStrategy();\r
+        }\r
         return columnNamingStrategy;\r
     }\r
 \r
-       /**\r
-        * Gets the file encoding to use for reading the file.\r
-        * \r
-        * @return the text encoding to use for reading the file.\r
-        */\r
-       public String getEncoding() {\r
-               return encoding;\r
-       }\r
-\r
-       /**\r
-        * Gets the width of each value within the fixed width value file.\r
-        * \r
-        * @return the fixed width to use when parsing the file.\r
-        */\r
-       public int getFixedValueWidth() {\r
-               return fixedValueWidth;\r
-       }\r
-\r
-       public int[] getValueWidths() {\r
-               return valueWidths;\r
-       }\r
-\r
-       /**\r
-        * Determines if the {@link DataSet#next()} should throw an exception in\r
-        * case of inconsistent line width in the fixed width value file.\r
-        * \r
-        * @return a boolean indicating whether or not to fail on inconsistent line\r
-        *         widths.\r
-        */\r
-       public boolean isFailOnInconsistentLineWidth() {\r
-               return failOnInconsistentLineWidth;\r
-       }\r
-\r
-       @Override\r
-       protected void decorateIdentity(List<Object> identifiers) {\r
-               identifiers.add(columnNameLineNumber);\r
-               identifiers.add(encoding);\r
-               identifiers.add(fixedValueWidth);\r
-               identifiers.add(valueWidths);\r
-               identifiers.add(failOnInconsistentLineWidth);\r
-       }\r
-\r
-       @Override\r
-       public String toString() {\r
-               return "FixedWidthConfiguration[encoding=" + encoding\r
-                               + ", fixedValueWidth=" + fixedValueWidth + ", valueWidths="\r
-                               + Arrays.toString(valueWidths) + ", columnNameLineNumber="\r
-                               + columnNameLineNumber + ", failOnInconsistentLineWidth="\r
-                               + failOnInconsistentLineWidth + "]";\r
-       }\r
-\r
-       public boolean isConstantValueWidth() {\r
-               return fixedValueWidth != -1;\r
-       }\r
-\r
-       public int getValueWidth(int columnIndex) {\r
-               if (isConstantValueWidth()) {\r
-                       return fixedValueWidth;\r
-               }\r
-               return valueWidths[columnIndex];\r
-       }\r
+    /**\r
+     * Gets the file encoding to use for reading the file.\r
+     *\r
+     * @return the text encoding to use for reading the file.\r
+     */\r
+    public String getEncoding() {\r
+        return encoding;\r
+    }\r
+\r
+    /**\r
+     * Gets the width of each value within the fixed width value file.\r
+     *\r
+     * @return the fixed width to use when parsing the file.\r
+     */\r
+    public int getFixedValueWidth() {\r
+        return fixedValueWidth;\r
+    }\r
+\r
+    /**\r
+     * Gets the individual widths of the values, as used when the value width is not constant.\r
+     *\r
+     * @return the configured array of value widths (the internal array itself, not a copy).\r
+     */\r
+    public int[] getValueWidths() {\r
+        return valueWidths;\r
+    }\r
+\r
+    /**\r
+     * Determines if the {@link DataSet#next()} should throw an exception in\r
+     * case of inconsistent line width in the fixed width value file.\r
+     *\r
+     * @return a boolean indicating whether or not to fail on inconsistent line\r
+     *         widths.\r
+     */\r
+    public boolean isFailOnInconsistentLineWidth() {\r
+        return failOnInconsistentLineWidth;\r
+    }\r
+\r
+    @Override\r
+    protected void decorateIdentity(List<Object> identifiers) {\r
+        identifiers.add(columnNameLineNumber);\r
+        identifiers.add(encoding);\r
+        identifiers.add(fixedValueWidth);\r
+        identifiers.add(valueWidths);\r
+        identifiers.add(failOnInconsistentLineWidth);\r
+    }\r
+\r
+    @Override\r
+    public String toString() {\r
+        return "FixedWidthConfiguration[encoding=" + encoding\r
+                + ", fixedValueWidth=" + fixedValueWidth + ", valueWidths="\r
+                + Arrays.toString(valueWidths) + ", columnNameLineNumber="\r
+                + columnNameLineNumber + ", failOnInconsistentLineWidth="\r
+                + failOnInconsistentLineWidth + "]";\r
+    }\r
+\r
+    /**\r
+     * Determines if all values share one width, which is the case when the fixed value width is not -1.\r
+     *\r
+     * @return true if a single fixed value width applies to every value.\r
+     */\r
+    public boolean isConstantValueWidth() {\r
+        return fixedValueWidth != -1;\r
+    }\r
+\r
+    /**\r
+     * Gets the width of the value at the given column index.\r
+     *\r
+     * @param columnIndex the index into the value widths; not used when the width is constant.\r
+     * @return the fixed value width if it is constant, otherwise the width configured for the column.\r
+     */\r
+    public int getValueWidth(int columnIndex) {\r
+        if (isConstantValueWidth()) {\r
+            return fixedValueWidth;\r
+        }\r
+        return valueWidths[columnIndex];\r
+    }\r
 }\r
index 9154e5e..264287f 100644 (file)
@@ -60,10 +60,9 @@ public class FixedWidthConfigurationReader {
      * "http://support.sas.com/documentation/cdl/en/etlug/67323/HTML/default/viewer.htm#p0h03yig7fp1qan1arghp3lwjqi6.htm">\r
      * described here</a>.\r
      * \r
-     * @param encoding\r
-     * @param resource\r
-     *            the format file resource\r
-     * @param failOnInconsistentLineWidth\r
+     * @param encoding the format file encoding\r
+     * @param resource the format file resource \r
+     * @param failOnInconsistentLineWidth flag specifying whether inconsistent line should stop processing or not\r
      * @return a {@link FixedWidthConfiguration} object to use\r
      */\r
     public FixedWidthConfiguration readFromSasFormatFile(String encoding, Resource resource,\r
@@ -88,13 +87,11 @@ public class FixedWidthConfigurationReader {
 \r
     /**\r
      * Reads a {@link FixedWidthConfiguration} based on a SAS INPUT declaration.\r
-     * The reader method also optionally will look for a LABEL defintion for\r
-     * column naming.\r
+     * The reader method also optionally will look for a LABEL definition for column naming.\r
      * \r
-     * @param encoding\r
-     * @param resource\r
-     *            the format file resource\r
-     * @param failOnInconsistentLineWidth\r
+     * @param encoding the format file encoding\r
+     * @param resource the format file resource\r
+     * @param failOnInconsistentLineWidth flag specifying whether inconsistent line should stop processing or not\r
      * @return a {@link FixedWidthConfiguration} object to use\r
      */\r
     public FixedWidthConfiguration readFromSasInputDefinition(String encoding, Resource resource,\r
@@ -176,5 +173,4 @@ public class FixedWidthConfigurationReader {
 \r
         return new FixedWidthConfiguration(encoding, columnSpecs, failOnInconsistentLineWidth);\r
     }\r
-\r
 }\r
index d28a0b2..027cdab 100644 (file)
@@ -18,9 +18,9 @@
  */
 package org.apache.metamodel.fixedwidth;
 
+import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.InputStream;
-import java.io.Reader;
 
 import org.apache.metamodel.MetaModelException;
 import org.apache.metamodel.QueryPostprocessDataContext;
@@ -106,7 +106,7 @@ public class FixedWidthDataContext extends QueryPostprocessDataContext {
     /**
      * Gets the resource being read
      * 
-     * @return
+     * @return a {@link Resource} object
      */
     public Resource getResource() {
         return _resource;
@@ -184,16 +184,23 @@ public class FixedWidthDataContext extends QueryPostprocessDataContext {
 
     private FixedWidthReader createReader() {
         final InputStream inputStream = _resource.read();
-        final Reader fileReader = FileHelper.getReader(inputStream, _configuration.getEncoding());
         final FixedWidthReader reader;
-        if (_configuration.isConstantValueWidth()) {
-            reader = new FixedWidthReader(fileReader, _configuration.getFixedValueWidth(), _configuration
-                    .isFailOnInconsistentLineWidth());
+        
+        if (_configuration instanceof EbcdicConfiguration) {
+            reader = new EbcdicReader((BufferedInputStream) inputStream, _configuration.getEncoding(),
+                    _configuration.getValueWidths(), _configuration.isFailOnInconsistentLineWidth(), 
+                    ((EbcdicConfiguration) _configuration).isSkipEbcdicHeader(), 
+                    ((EbcdicConfiguration) _configuration).isEolPresent());
         } else {
-            reader = new FixedWidthReader(fileReader, _configuration.getValueWidths(), _configuration
-                    .isFailOnInconsistentLineWidth());
+            if (_configuration.isConstantValueWidth()) {
+                reader = new FixedWidthReader(inputStream, _configuration.getEncoding(),
+                        _configuration.getFixedValueWidth(), _configuration.isFailOnInconsistentLineWidth());
+            } else {
+                reader = new FixedWidthReader(inputStream, _configuration.getEncoding(), 
+                        _configuration.getValueWidths(), _configuration.isFailOnInconsistentLineWidth());
+            }
         }
+
         return reader;
     }
-
 }
index 44ce808..4f78bab 100644 (file)
@@ -98,8 +98,7 @@ class FixedWidthDataSet extends AbstractDataSet {
                        if (columnNumber < stringValues.length) {
                                rowValues[i] = stringValues[columnNumber];
                        } else {
-                               // Ticket #125: Missing values should be enterpreted as
-                               // null.
+                               // Ticket #125: Missing values should be interpreted as null.
                                rowValues[i] = null;
                        }
                }
index d7a18cf..da17ff1 100644 (file)
  */
 package org.apache.metamodel.fixedwidth;
 
-import java.io.BufferedReader;
+import java.io.BufferedInputStream;
 import java.io.Closeable;
 import java.io.IOException;
-import java.io.Reader;
+import java.io.InputStream;
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+import java.util.ArrayList;
+import java.util.List;
 
 /**
  * Reader capable of separating values based on a fixed width setting.
  */
-final public class FixedWidthReader implements Closeable {
-
-       private final BufferedReader _reader;
-       private final FixedWidthLineParser _parser; 
-
-       public FixedWidthReader(Reader reader, int fixedValueWidth,
-                       boolean failOnInconsistentLineWidth) {
-               this(new BufferedReader(reader), fixedValueWidth,
-                               failOnInconsistentLineWidth);
-       }
-
-       public FixedWidthReader(BufferedReader reader, int fixedValueWidth,
-                       boolean failOnInconsistentLineWidth) {
-               _reader = reader;
-        final FixedWidthConfiguration fixedWidthConfiguration = new FixedWidthConfiguration(
-                FixedWidthConfiguration.NO_COLUMN_NAME_LINE, null, fixedValueWidth, failOnInconsistentLineWidth);
-        _parser = new FixedWidthLineParser(fixedWidthConfiguration, -1, 0);
-       }
-
-       public FixedWidthReader(Reader reader, int[] valueWidths,
-                       boolean failOnInconsistentLineWidth) {
-               this(new BufferedReader(reader), valueWidths,
-                               failOnInconsistentLineWidth);
-       }
-
-       public FixedWidthReader(BufferedReader reader, int[] valueWidths,
-                       boolean failOnInconsistentLineWidth) {
-               _reader = reader;
-               int fixedValueWidth = -1;
-               int expectedLineLength = 0;
-               if (fixedValueWidth == -1) {
-                       for (int i = 0; i < valueWidths.length; i++) {
-                               expectedLineLength += valueWidths[i];
-                       }
-               }
-        final FixedWidthConfiguration fixedWidthConfiguration = new FixedWidthConfiguration(
-                FixedWidthConfiguration.NO_COLUMN_NAME_LINE, null, valueWidths, failOnInconsistentLineWidth);
-        _parser = new FixedWidthLineParser(fixedWidthConfiguration, expectedLineLength, 0);
-       }
-
-       
-       /***
-        * Reads the next line in the file.
-        * 
-        * @return an array of values in the next line, or null if the end of the
-        *         file has been reached.
-        * 
-        * @throws IllegalStateException
-        *             if an exception occurs while reading the file.
-        */
-       public String[] readLine() throws IllegalStateException {
-        String line;
+class FixedWidthReader implements Closeable {
+    private static final int END_OF_STREAM = -1;
+    private static final int LINE_FEED = '\n';
+    private static final int CARRIAGE_RETURN = '\r';
+    
+    protected final String _charsetName;
+    private final int _fixedValueWidth;
+    private final int[] _valueWidths;
+    private int _valueIndex = 0;
+    private final boolean _failOnInconsistentLineWidth;
+    private final boolean _constantWidth;
+    private volatile int _rowNumber;
+    protected final BufferedInputStream _stream;
+    protected final int _expectedLineLength;
+
+    public FixedWidthReader(InputStream stream, String charsetName, int fixedValueWidth,
+            boolean failOnInconsistentLineWidth) {
+        this(new BufferedInputStream(stream), charsetName, fixedValueWidth, failOnInconsistentLineWidth);
+    }
+
+    private FixedWidthReader(BufferedInputStream stream, String charsetName, int fixedValueWidth,
+            boolean failOnInconsistentLineWidth) {
+        _stream = stream;
+        _charsetName = charsetName;
+        _fixedValueWidth = fixedValueWidth;
+        _failOnInconsistentLineWidth = failOnInconsistentLineWidth;
+        _rowNumber = 0;
+        _valueWidths = null;
+        _constantWidth = true;
+        _expectedLineLength = -1;
+    }
+
+    public FixedWidthReader(InputStream stream, String charsetName, int[] valueWidths,
+            boolean failOnInconsistentLineWidth) {
+        this(new BufferedInputStream(stream), charsetName, valueWidths, failOnInconsistentLineWidth);
+    }
+
+    FixedWidthReader(BufferedInputStream stream, String charsetName, int[] valueWidths,
+            boolean failOnInconsistentLineWidth) {
+        _stream = stream;
+        _charsetName = charsetName;
+        _fixedValueWidth = -1;
+        _valueWidths = valueWidths;
+        _failOnInconsistentLineWidth = failOnInconsistentLineWidth;
+        _rowNumber = 0;
+        _constantWidth = false;
+        int expectedLineLength = 0;
+
+        for (final int _valueWidth : _valueWidths) {
+            expectedLineLength += _valueWidth;
+        }
+
+        _expectedLineLength = expectedLineLength;
+    }
+
+    /**
+     * This reads and returns the next record from the file. Usually, it is a line but in case the new line characters
+     * are not present, the length of the content depends on the column-widths setting.
+     *
+     * @return an array of values in the next line, or null if the end of the file has been reached.
+     * @throws IllegalStateException if an exception occurs while reading the file.
+     */
+    public String[] readLine() throws IllegalStateException {
         try {
-            line = _reader.readLine();
-            return _parser.parseLine(line);
+            beforeReadLine();
+            _rowNumber++;
+            return getValues();
         } catch (IOException e) {
             throw new IllegalStateException(e);
         }
-       }
-       
+    }
+
+    /**
+     * Empty hook that enables special behavior in sub-classed readers (by overriding this method). 
+     */
+    protected void beforeReadLine() {
+        return;
+    }
+
+    private String[] getValues() throws IOException {
+        final List<String> values = new ArrayList<>();
+        final String singleRecordData = readSingleRecordData();
+
+        if (singleRecordData == null) {
+            return null;
+        }
+
+        processSingleRecordData(singleRecordData, values);
+        String[] result = values.toArray(new String[values.size()]);
+
+        if (!_failOnInconsistentLineWidth && !_constantWidth) {
+            result = correctResult(result);
+        }
+
+        validateConsistentValue(singleRecordData, result, values.size());
+
+        return result;
+    }
+
+    private void validateConsistentValue(String recordData, String[] result, int valuesSize) {
+        if (!_failOnInconsistentLineWidth) {
+            return;
+        }
+
+        InconsistentValueWidthException inconsistentValueException = null;
+
+        if (_constantWidth) {
+            if (recordData.length() % _fixedValueWidth != 0) {
+                inconsistentValueException = new InconsistentValueWidthException(result, recordData, _rowNumber);
+            }
+        } else if (result.length != valuesSize || recordData.length() != _expectedLineLength) {
+            inconsistentValueException = new InconsistentValueWidthException(result, recordData, _rowNumber);
+        }
+
+        if (inconsistentValueException != null) {
+            throw inconsistentValueException;
+        }
+    }
+
+    private void processSingleRecordData(final String singleRecordData, final List<String> values) {
+        StringBuilder nextValue = new StringBuilder();
+        final CharacterIterator it = new StringCharacterIterator(singleRecordData);
+        _valueIndex = 0;
+
+        for (char c = it.first(); c != CharacterIterator.DONE; c = it.next()) {
+            processCharacter(c, nextValue, values, singleRecordData);
+        }
+
+        if (nextValue.length() > 0) {
+            addNewValueIfAppropriate(values, nextValue);
+        }
+    }
+
+    String readSingleRecordData() throws IOException {
+        StringBuilder line = new StringBuilder();
+        int ch;
+
+        for (ch = _stream.read(); !isEndingCharacter(ch); ch = _stream.read()) {
+            line.append((char) ch);
+        }
+
+        if (ch == CARRIAGE_RETURN) {
+            readLineFeedIfFollows();
+        }
+
+        return (line.length()) > 0 ? line.toString() : null;
+    }
+    
+    private void readLineFeedIfFollows() throws IOException {
+        _stream.mark(1);
+
+        if (_stream.read() != LINE_FEED) {
+            _stream.reset();
+        }
+    }
+
+    /**
+     * Tells whether the given value read from the stream ends a record: a carriage return, a line feed
+     * or the end-of-stream marker.
+     */
+    private boolean isEndingCharacter(int ch) {
+        switch (ch) {
+        case CARRIAGE_RETURN:
+        case LINE_FEED:
+        case END_OF_STREAM:
+            return true;
+        default:
+            return false;
+        }
+    }
+    
+    /**
+     * Appends a character to the value being built and, once that value has reached its width, stores
+     * it and moves on to the next value.
+     *
+     * @param c the character to process
+     * @param nextValue buffer holding the value currently being built
+     * @param values the values collected so far
+     * @param recordData the raw data of the record, used for error reporting
+     */
+    private void processCharacter(char c, StringBuilder nextValue, List<String> values, String recordData) {
+        nextValue.append(c);
+
+        if (nextValue.length() != getValueWidth(values, recordData)) {
+            return; // the current value is not complete yet
+        }
+
+        addNewValueIfAppropriate(values, nextValue);
+        nextValue.setLength(0); // start over with an empty buffer
+
+        if (_valueWidths != null) {
+            // advance to the next column, wrapping around after the last one
+            _valueIndex = (_valueIndex + 1) % _valueWidths.length;
+        }
+    }
+
+    private int getValueWidth(List<String> values, String recordData) {
+        if (_constantWidth) {
+            return _fixedValueWidth;
+        } else {
+            if (_valueIndex >= _valueWidths.length) {
+                if (_failOnInconsistentLineWidth) {
+                    String[] result = values.toArray(new String[values.size()]);
+                    throw new InconsistentValueWidthException(result, recordData, _rowNumber + 1);
+                } else {
+                    return -1; // silently ignore the inconsistency
+                }
+            }
+
+            return _valueWidths[_valueIndex];
+        }
+    }
+
+    private void addNewValueIfAppropriate(List<String> values, StringBuilder nextValue) {
+        if (_valueWidths != null) {
+            if (values.size() < _valueWidths.length) {
+                values.add(nextValue.toString().trim());
+            }
+        } else {
+            values.add(nextValue.toString().trim());
+        }
+    }
+
+    private String[] correctResult(String[] result) {
+        if (result.length != _valueWidths.length) {
+            String[] correctedResult = new String[_valueWidths.length];
+
+            for (int i = 0; i < result.length && i < _valueWidths.length; i++) {
+                correctedResult[i] = result[i];
+            }
+
+            result = correctedResult;
+        }
 
-       @Override
-       public void close() throws IOException {
-               _reader.close();
-       }
+        return result;
+    }
 
+    @Override
+    public void close() throws IOException {
+        _stream.close();
+    }
 }
diff --git a/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/EBCDICTest.java b/fixedwidth/src/test/java/org/apache/metamodel/fixedwidth/EBCDICTest.java
new file mode 100644 (file)
index 0000000..ea19960
--- /dev/null
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.metamodel.fixedwidth;
+
+import java.io.File;
+
+import org.apache.metamodel.data.DataSet;
+import org.apache.metamodel.schema.Schema;
+import org.apache.metamodel.schema.Table;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class EBCDICTest {
+    private static final int[] COLUMN_WIDTHS = new int[] { 2, 7, 10, 10 };
+    private static final long EXPECTED_ROWS_COUNT = 49; // 50 lines, 1. is a header
+    private static final String ENCODING = "IBM500";
+    private static final String[] EXPECTED_ROWS = new String[] {
+            "Row[values=[01, name-01, surname-01, address-01]]",
+            "Row[values=[02, name-02, surname-02, address-02]]",
+            "Row[values=[03, name-03, surname-03, address-03]]",
+    };
+    private final FixedWidthDataContext _context;
+    private final Table _table;
+
+    public EBCDICTest() {
+        String fileName = "fixed-width-2-7-10-10.ebc";
+        FixedWidthConfiguration configuration = new EbcdicConfiguration(FixedWidthConfiguration.NO_COLUMN_NAME_LINE,
+                ENCODING, COLUMN_WIDTHS, false, true, false);
+        _context = new FixedWidthDataContext(new File("src/test/resources/" + fileName), configuration);
+        Schema schema = _context.getDefaultSchema();
+        _table = schema.getTableByName(fileName);
+    }
+
+    @Test
+    public void testRowsCount() throws Exception {
+        long rows = 0;
+
+        try (final DataSet dataSet = _context.query().from(_table).selectCount().execute()) {
+            if (dataSet.next()) {
+                Object[] values = dataSet.getRow().getValues();
+                rows = (long) values[0];
+            }
+        }
+
+        assertEquals(EXPECTED_ROWS_COUNT, rows);
+    }
+
+    @Test
+    public void testFirstRows() throws Exception {
+        int limit = EXPECTED_ROWS.length;
+        int i = 0;
+
+        try (final DataSet dataSet = _context.query().from(_table).selectAll().limit(limit).execute()) {
+            while (dataSet.next()) {
+                assertEquals(EXPECTED_ROWS[i], dataSet.getRow().toString());
+                i++;
+            }
+        }
+    }
+}
index 8225be0..f03d633 100644 (file)
@@ -18,8 +18,6 @@
  */
 package org.apache.metamodel.fixedwidth;
 
-import org.apache.metamodel.fixedwidth.FixedWidthConfiguration;
-
 import junit.framework.TestCase;
 
 public class FixedWidthConfigurationTest extends TestCase {
@@ -31,14 +29,11 @@ public class FixedWidthConfigurationTest extends TestCase {
        }
 
        public void testEquals() throws Exception {
-               FixedWidthConfiguration conf1 = new FixedWidthConfiguration(1, "UTF8",
-                               10, true);
-               FixedWidthConfiguration conf2 = new FixedWidthConfiguration(1, "UTF8",
-                               10, true);
+               FixedWidthConfiguration conf1 = new FixedWidthConfiguration(1, "UTF8", 10, true); // conf1 and conf2 are built from identical arguments
+               FixedWidthConfiguration conf2 = new FixedWidthConfiguration(1, "UTF8", 10, true);
                assertEquals(conf1, conf2);
 
-               FixedWidthConfiguration conf3 = new FixedWidthConfiguration(1, "UTF8",
-                               10, false);
+               FixedWidthConfiguration conf3 = new FixedWidthConfiguration(1, "UTF8", 10, false); // differs from conf1 only in the last (boolean) argument
                assertFalse(conf1.equals(conf3));
        }
 }
index 2ac3680..7962cf6 100644 (file)
@@ -25,9 +25,6 @@ import junit.framework.TestCase;
 
 import org.apache.metamodel.DataContext;
 import org.apache.metamodel.data.DataSet;
-import org.apache.metamodel.fixedwidth.FixedWidthConfiguration;
-import org.apache.metamodel.fixedwidth.FixedWidthDataContext;
-import org.apache.metamodel.fixedwidth.InconsistentValueWidthException;
 import org.apache.metamodel.query.Query;
 import org.apache.metamodel.schema.Schema;
 import org.apache.metamodel.schema.Table;
index 4d11f0e..8f40c1d 100644 (file)
  */
 package org.apache.metamodel.fixedwidth;
 
-import static org.junit.Assert.assertEquals;
-
-import java.io.BufferedReader;
+import java.io.BufferedInputStream;
 import java.io.File;
-import java.io.FileReader;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.util.Arrays;
 
@@ -30,7 +28,10 @@ import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.ExpectedException;
 
+import static org.junit.Assert.assertEquals;
+
 public class FixedWidthReaderTest {
+    private static final String CHARSET = "UTF-8";
 
     @Rule
     public final ExpectedException exception = ExpectedException.none();
@@ -38,9 +39,9 @@ public class FixedWidthReaderTest {
     @Test
     public void testBufferedReader1() throws IOException {
         final File file = new File("src/test/resources/example_simple1.txt");
-        final BufferedReader reader = new BufferedReader(new FileReader(file));
+        final BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file));
         int[] widths = new int[] { 8, 9 };
-        try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(reader, widths, false)) {
+        try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(stream, CHARSET, widths, false)) {
             final String[] line1 = fixedWidthReader.readLine();
             assertEquals("[greeting, greeter]", Arrays.asList(line1).toString());
             final String[] line2 = fixedWidthReader.readLine();
@@ -53,9 +54,9 @@ public class FixedWidthReaderTest {
     @Test
     public void testBufferedReader2() throws IOException {
         final File file = new File("src/test/resources/example_simple2.txt");
-        final BufferedReader reader = new BufferedReader(new FileReader(file));
+        final BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file));
         int[] widths = new int[] {1, 8, 9 };
-        try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(reader, widths, false)) {
+        try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(stream, CHARSET, widths, false)) {
             final String[] line1 = fixedWidthReader.readLine();
             assertEquals("[i, greeting, greeter]", Arrays.asList(line1).toString());
             final String[] line2 = fixedWidthReader.readLine();
@@ -68,8 +69,8 @@ public class FixedWidthReaderTest {
     @Test
     public void testBufferedReader3() throws IOException {
         final File file = new File("src/test/resources/example_simple3.txt");
-        final BufferedReader reader = new BufferedReader(new FileReader(file));
-        try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(reader, 5, false)) {
+        final BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file));
+        try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(stream, CHARSET, 5, false)) {
             final String[] line1 = fixedWidthReader.readLine();
             assertEquals("[hello]", Arrays.asList(line1).toString());
             final String[] line2 = fixedWidthReader.readLine();
@@ -84,8 +85,8 @@ public class FixedWidthReaderTest {
     @Test
     public void testBufferedReaderFailOnInconsistentRows() throws IOException {
         final File file = new File("src/test/resources/example_simple3.txt");
-        final BufferedReader reader = new BufferedReader(new FileReader(file));
-        try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(reader, 5, true)) {
+        final BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file));
+        try (final FixedWidthReader fixedWidthReader = new FixedWidthReader(stream, CHARSET, 5, true)) {
             final String[] line1 = fixedWidthReader.readLine();
             assertEquals("[hello]", Arrays.asList(line1).toString());
             final String[] line2 = fixedWidthReader.readLine();
@@ -98,6 +99,4 @@ public class FixedWidthReaderTest {
             final String[] line4 = fixedWidthReader.readLine();
         }
     }
-
-   
 }
diff --git a/fixedwidth/src/test/resources/fixed-width-2-7-10-10.ebc b/fixedwidth/src/test/resources/fixed-width-2-7-10-10.ebc
new file mode 100644 (file)
index 0000000..09fcc70
--- /dev/null
@@ -0,0 +1 @@
+ÉÄÕÁÔÅ`ÉÄâäÙÕÁÔÅ`ÉÄÁÄÄÙÅââ`ÉÄðñ\95\81\94\85`ðñ¢¤\99\95\81\94\85`ðñ\81\84\84\99\85¢¢`ðñðò\95\81\94\85`ðò¢¤\99\95\81\94\85`ðò\81\84\84\99\85¢¢`ðòðó\95\81\94\85`ðó¢¤\99\95\81\94\85`ðó\81\84\84\99\85¢¢`ðóðô\95\81\94\85`ðô¢¤\99\95\81\94\85`ðô\81\84\84\99\85¢¢`ðôðõ\95\81\94\85`ðõ¢¤\99\95\81\94\85`ðõ\81\84\84\99\85¢¢`ðõðö\95\81\94\85`ðö¢¤\99\95\81\94\85`ðö\81\84\84\99\85¢¢`ðöð÷\95\81\94\85`ð÷¢¤\99\95\81\94\85`ð÷\81\84\84\99\85¢¢`ð÷ðø\95\81\94\85`ðø¢¤\99\95\81\94\85`ðø\81\84\84\99\85¢¢`ðøðù\95\81\94\85`ðù¢¤\99\95\81\94\85`ðù\81\84\84\99\85¢¢`ðùñð\95\81\94\85`ñð¢¤\99\95\81\94\85`ñð\81\84\84\99\85¢¢`ñðññ\95\81\94\85`ññ¢¤\99\95\81\94\85`ññ\81\84\84\99\85¢¢`ñññò\95\81\94\85`ñò¢¤\99\95\81\94\85`ñò\81\84\84\99\85¢¢`ñòñó\95\81\94\85`ñó¢¤\99\95\81\94\85`ñó\81\84\84\99\85¢¢`ñóñô\95\81\94\85`ñô¢¤\99\95\81\94\85`ñô\81\84\84\99\85¢¢`ñôñõ\95\81\94\85`ñõ¢¤\99\95\81\94\85`ñõ\81\84\84\99\85¢¢`ñõñö\95\81\94\85`ñö¢¤\99\95\81\94\85`ñö\81\84\84\99\85¢¢`ñöñ÷\95\81\94\85`ñ÷¢¤\99\95\81\94\85`ñ÷\81\84\84\99\85¢¢`ñ÷ñø\95\81\94\85`ñø¢¤\99\95\81\94\85`ñø\81\84\84\99\85¢¢`ñøñù\95\81\94\85`ñù¢¤\99\95\81\94\85`ñù\81\84\84\99\85¢¢`ñùòð\95\81\94\85`òð¢¤\99\95\81\94\85`òð\81\84\84\99\85¢¢`òðòñ\95\81\94\85`òñ¢¤\99\95\81\94\85`òñ\81\84\84\99\85¢¢`òñòò\95\81\94\85`òò¢¤\99\95\81\94\85`òò\81\84\84\99\85¢¢`òòòó\95\81\94\85`òó¢¤\99\95\81\94\85`òó\81\84\84\99\85¢¢`òóòô\95\81\94\85`òô¢¤\99\95\81\94\85`òô\81\84\84\99\85¢¢`òôòõ\95\81\94\85`òõ¢¤\99\95\81\94\85`òõ\81\84\84\99\85¢¢`òõòö\95\81\94\85`òö¢¤\99\95\81\94\85`òö\81\84\84\99\85¢¢`òöò÷\95\81\94\85`ò÷¢¤\99\95\81\94\85`ò÷\81\84\84\99\85¢¢`ò÷òø\95\81\94\85`òø¢¤\99\95\81\94\85`òø\81\84\84\99\85¢¢`òøòù\95\81\94\85`òù¢¤\99\95\81\94\85`òù\81\84\84\99\85¢¢`òùóð\95\81\94\85`óð¢¤\99\95\81\94\85`óð\81\84\84\99\85¢¢`óðóñ\95\81\94\85`óñ¢¤\99\95\81\94\85`óñ\81\84\84\99\85¢¢`óñóò\95\81\94\85`óò¢¤\99\95\81\94\85`óò\81\84\84\99\85¢¢`óòóó\95\81\94\85`óó¢¤\99\95\81\94\85`óó\81\84\84\99\85¢¢`óóóô\95\81\94\85`óô¢¤\99\95\81\94\85`óô\81\84\84\99\85¢¢`óôóõ\95\81\94\85`óõ¢¤\99\95\81\94\
85`óõ\81\84\84\99\85¢¢`óõóö\95\81\94\85`óö¢¤\99\95\81\94\85`óö\81\84\84\99\85¢¢`óöó÷\95\81\94\85`ó÷¢¤\99\95\81\94\85`ó÷\81\84\84\99\85¢¢`ó÷óø\95\81\94\85`óø¢¤\99\95\81\94\85`óø\81\84\84\99\85¢¢`óøóù\95\81\94\85`óù¢¤\99\95\81\94\85`óù\81\84\84\99\85¢¢`óùôð\95\81\94\85`ôð¢¤\99\95\81\94\85`ôð\81\84\84\99\85¢¢`ôðôñ\95\81\94\85`ôñ¢¤\99\95\81\94\85`ôñ\81\84\84\99\85¢¢`ôñôò\95\81\94\85`ôò¢¤\99\95\81\94\85`ôò\81\84\84\99\85¢¢`ôòôó\95\81\94\85`ôó¢¤\99\95\81\94\85`ôó\81\84\84\99\85¢¢`ôóôô\95\81\94\85`ôô¢¤\99\95\81\94\85`ôô\81\84\84\99\85¢¢`ôôôõ\95\81\94\85`ôõ¢¤\99\95\81\94\85`ôõ\81\84\84\99\85¢¢`ôõôö\95\81\94\85`ôö¢¤\99\95\81\94\85`ôö\81\84\84\99\85¢¢`ôöô÷\95\81\94\85`ô÷¢¤\99\95\81\94\85`ô÷\81\84\84\99\85¢¢`ô÷ôø\95\81\94\85`ôø¢¤\99\95\81\94\85`ôø\81\84\84\99\85¢¢`ôøôù\95\81\94\85`ôù¢¤\99\95\81\94\85`ôù\81\84\84\99\85¢¢`ôù
\ No newline at end of file