Java: How to deal with the BOM in a Unicode InputStream

Ok, so I was happily reading CSV files from an SFTP server. The file content is returned as an InputStream and I used a BufferedReader to read it line by line. Each line contained either a header or an order. The header lines started with the string “HDR”.

However, I suddenly discovered that my code was consistently skipping the first header (and as a result the orders belonging to it). The reason, I found, was simple. The first header, on the first line, didn’t start with “HDR”, it started with “□HDR”! And that undisplayable square turned out to be a Unicode Byte Order Mark (BOM).

To deal with the BOM, we can use a simple class I found at StackOverflow which handles it for us. Here it is 🙂

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * The {@link UnicodeBOMInputStream} class wraps any
 * {@link InputStream} and detects the presence of any Unicode BOM
 * (Byte Order Mark) at its beginning, as defined by
 * <a href="http://www.faqs.org/rfcs/rfc3629.html">RFC 3629 - UTF-8, a transformation format of ISO 10646</a>
 *
 * <p>The
 * <a href="http://www.unicode.org/unicode/faq/utf_bom.html">Unicode FAQ</a>
 * defines 5 types of BOMs:</p>
 * <ul>
 * <li><pre>00 00 FE FF  = UTF-32, big-endian</pre></li>
 * <li><pre>FF FE 00 00  = UTF-32, little-endian</pre></li>
 * <li><pre>FE FF        = UTF-16, big-endian</pre></li>
 * <li><pre>FF FE        = UTF-16, little-endian</pre></li>
 * <li><pre>EF BB BF     = UTF-8</pre></li>
 * </ul>
 *
 * <p>Detection happens in the constructor: up to four bytes are read from the
 * wrapped stream, inspected, and pushed back, so the BOM is still the first
 * thing returned by {@code read} unless {@link #skipBOM()} is called.</p>
 *
 * <p>Use the {@link #getBOM()} method to know whether a BOM has been detected
 * or not.
 * </p>
 * <p>Use the {@link #skipBOM()} method to remove the detected BOM from the
 * wrapped {@link InputStream} object.</p>
 *
 * @author Gregory Pakosz
 * @see <a href="http://stackoverflow.com/q/1835430/39321#1835529">Original answer on Stack Overflow</a>
 */

class UnicodeBOMInputStream extends InputStream
{
  /**
   * Type safe enumeration class that describes the different types of Unicode
   * BOMs.
   */

  public static final class BOM
  {
    /**
     * NONE.
     */

    public static final BOM NONE = new BOM(new byte[]{},"NONE");

    /**
     * UTF-8 BOM (EF BB BF).
     */

    public static final BOM UTF_8 = new BOM(new byte[]{(byte)0xEF,
                                                       (byte)0xBB,
                                                       (byte)0xBF},
                                            "UTF-8");

    /**
     * UTF-16, little-endian (FF FE).
     */

    public static final BOM UTF_16_LE = new BOM(new byte[]{ (byte)0xFF,
                                                            (byte)0xFE},
                                                "UTF-16 little-endian");

    /**
     * UTF-16, big-endian (FE FF).
     */

    public static final BOM UTF_16_BE = new BOM(new byte[]{ (byte)0xFE,
                                                            (byte)0xFF},
                                                "UTF-16 big-endian");

    /**
     * UTF-32, little-endian (FF FE 00 00).
     */

    public static final BOM UTF_32_LE = new BOM(new byte[]{ (byte)0xFF,
                                                            (byte)0xFE,
                                                            (byte)0x00,
                                                            (byte)0x00},
                                                "UTF-32 little-endian");

    /**
     * UTF-32, big-endian (00 00 FE FF).
     */

    public static final BOM UTF_32_BE = new BOM(new byte[]{ (byte)0x00,
                                                            (byte)0x00,
                                                            (byte)0xFE,
                                                            (byte)0xFF},
                                                "UTF-32 big-endian");

    /**
     * Returns a {@link String} representation of this {@link BOM} value.
     *
     * @return the description given to this {@link BOM} at construction.
     */

    @Override
    public final String toString()
    {
      return description;
    }

    /**
     * Returns the bytes corresponding to this {@link BOM} value.
     *
     * @return a fresh copy of the BOM bytes; empty for {@link #NONE}.
     */

    public final byte[] getBytes()
    {
      // Defensive copy so callers cannot alter the shared constant.
      return bytes.clone();
    }

    private BOM(final byte[] bom, final String description)
    {
      this.bytes        = bom;
      this.description  = description;
    }

    private final byte[]  bytes;
    private final String  description;

  }

  /** Length in bytes of the longest BOM (UTF-32). */
  private static final int MAX_BOM_LENGTH = 4;

  /**
   * BOMs in the order they are tested. The UTF-32 marks come first because
   * the UTF-32 little-endian mark (FF FE 00 00) begins with the UTF-16
   * little-endian mark (FF FE).
   */
  private static final BOM[] CANDIDATES =
  {
    BOM.UTF_32_LE,
    BOM.UTF_32_BE,
    BOM.UTF_8,
    BOM.UTF_16_LE,
    BOM.UTF_16_BE
  };

  /**
   * Constructs a new {@link UnicodeBOMInputStream} that wraps the
   * specified {@link InputStream}.
   *
   * <p>Up to four bytes are read from the stream to detect the BOM and are
   * then pushed back, so no data is consumed by construction.</p>
   *
   * @param inputStream an {@link InputStream}.
   *
   * @throws IOException on reading from the specified {@link InputStream}
   * when trying to detect the Unicode BOM.
   */

  public UnicodeBOMInputStream(final InputStream inputStream) throws IOException
  {
    in = new PushbackInputStream(inputStream,MAX_BOM_LENGTH);

    final byte[]  head  = new byte[MAX_BOM_LENGTH];
    int           count = 0;

    // A single read() may return fewer bytes than requested even when more
    // are coming, so keep reading until the buffer is full or the stream ends.
    while (count < head.length)
    {
      final int n = in.read(head,count,head.length - count);

      // -1 is end of stream; 0 is not expected for a non-zero length, but
      // stopping on it avoids spinning forever on a misbehaving stream.
      if (n <= 0)
      {
        break;
      }
      count += n;
    }

    this.bom = detect(head,count);

    if (count > 0)
    {
      in.unread(head,0,count);
    }
  }

  /**
   * Returns the first candidate BOM that the given leading bytes start with,
   * or {@link BOM#NONE} when none matches.
   */
  private static BOM detect(final byte[] head, final int count)
  {
    for (int i = 0; i < CANDIDATES.length; i++)
    {
      if (startsWith(head,count,CANDIDATES[i].bytes))
      {
        return CANDIDATES[i];
      }
    }
    return BOM.NONE;
  }

  /**
   * Tells whether the first {@code count} bytes of {@code head} begin with
   * {@code prefix}.
   */
  private static boolean startsWith(final byte[] head,
                                    final int count,
                                    final byte[] prefix)
  {
    if (count < prefix.length)
    {
      return false;
    }
    for (int i = 0; i < prefix.length; i++)
    {
      if (head[i] != prefix[i])
      {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns the {@link BOM} that was detected in the wrapped
   * {@link InputStream} object.
   *
   * @return a {@link BOM} value.
   */

  public final BOM getBOM()
  {
    // BOM type is immutable.
    return bom;
  }

  /**
   * Skips the {@link BOM} that was found in the wrapped
   * {@link InputStream} object.
   *
   * <p>Only the first call skips anything; later calls do nothing. The skip
   * is a plain byte skip of the BOM's length from the current position, so
   * call it before reading from this stream.</p>
   *
   * @return this {@link UnicodeBOMInputStream}.
   *
   * @throws IOException when trying to skip the BOM from the wrapped {@link InputStream} object.
   */

  public final synchronized UnicodeBOMInputStream skipBOM() throws IOException
  {
    if ( ! skipped)
    {
      long remaining = bom.bytes.length;

      // skip() may skip fewer bytes than asked; repeat until done or stuck.
      while (remaining > 0)
      {
        final long n = in.skip(remaining);

        if (n <= 0)
        {
          break;
        }
        remaining -= n;
      }
      skipped = true;
    }
    return this;
  }

  /**
   * {@inheritDoc}
   */

  @Override
  public int read() throws IOException
  {
    return in.read();
  }

  /**
   * {@inheritDoc}
   */

  @Override
  public int read(final byte b[]) throws  IOException,
                                          NullPointerException
  {
    return in.read(b,0,b.length);
  }

  /**
   * {@inheritDoc}
   */

  @Override
  public int read(final byte b[],
                  final int off,
                  final int len) throws IOException,
                                        NullPointerException
  {
    return in.read(b,off,len);
  }

  /**
   * {@inheritDoc}
   */

  @Override
  public long skip(final long n) throws IOException
  {
    return in.skip(n);
  }

  /**
   * {@inheritDoc}
   */

  @Override
  public int available() throws IOException
  {
    return in.available();
  }

  /**
   * {@inheritDoc}
   */

  @Override
  public void close() throws IOException
  {
    in.close();
  }

  /**
   * {@inheritDoc}
   */

  @Override
  public synchronized void mark(final int readlimit)
  {
    in.mark(readlimit);
  }

  /**
   * {@inheritDoc}
   */

  @Override
  public synchronized void reset() throws IOException
  {
    in.reset();
  }

  /**
   * {@inheritDoc}
   */

  @Override
  public boolean markSupported()
  {
    return in.markSupported();
  }

  private final PushbackInputStream in;
  private final BOM                 bom;
  private       boolean             skipped = false;

}

Looks long, but it’s simple to use.

// Wrap the raw stream; skipBOM() drops the detected BOM bytes (none if no BOM was found).
InputStream cleanStream = new UnicodeBOMInputStream(stream).skipBOM();

When reading the stream, remember to also use an InputStreamReader with the correct character set. For example something like this:

// Decode the BOM-free bytes as UTF-8 and print every line.
BufferedReader reader = new BufferedReader(new InputStreamReader(cleanStream, "UTF-8"));
String line = null;
try {
    while ((line = reader.readLine()) != null) {
        System.out.println(line);
    }
} finally {
    // Close the reader (and the wrapped streams) even if reading fails.
    reader.close();
}

If the file might be any of the Unicode kinds, you can probably use the getBOM method of the UnicodeBOMInputStream to choose the right character set for the InputStreamReader.

To sum up, this is tricky and annoying stuff! If something in this post is wrong or inaccurate or should be done differently, please leave a comment. I want to get this right 🙂

Found an alternative to the above class in the Apache Commons IO library. It’s called BOMInputStream and seems to do pretty much the same thing, except it has some extra features as well. Might be nice to use if you’re already using that library or don’t mind adding another. 🙂