Remove BOM mark from text files in Python

When working with TensorFlow and TFLearn on Windows, I frequently run into a problem with my source data files being encoded as UTF-8 with a BOM header. A BOM (byte order mark) is a single Unicode character, U+FEFF, placed at the very start of the file; in UTF-8 it is encoded as the three bytes EF BB BF. Many data-loading utilities read the file with the wrong encoding and throw a ValueError about this unexpected character.

The large files often used as training data can be challenging to open and re-encode properly, so I created this function to rewrite the file in place without the BOM.

import os, sys, codecs

def remove_bom_inplace(path, buffer_size=4096):
    """Remove a leading UTF-8 BOM from a file, rewriting it in place.

    The file is opened in binary read/write mode. If it starts with
    ``codecs.BOM_UTF8``, the remaining bytes are shifted towards the start
    of the file one chunk at a time and the file is then truncated to its
    new length, so the whole file is never held in memory. A file that
    does not start with a BOM (including an empty file) is left untouched.

    Args:
        path: Path of the file to repair.
        buffer_size: Number of bytes copied per read/write step. Must be a
            positive integer; defaults to 4096. A larger value reduces the
            number of seek/read/write round trips on big files.

    Raises:
        ValueError: If ``buffer_size`` is not positive.
        OSError: If the file cannot be opened for reading and writing.
    """
    if buffer_size <= 0:
        raise ValueError("buffer_size must be a positive integer")

    bom = codecs.BOM_UTF8
    bom_length = len(bom)

    with open(path, "r+b") as fp:
        # Inspect only the first bytes so that detection does not depend on
        # buffer_size being at least as large as the BOM.
        if fp.read(bom_length) != bom:
            return

        write_pos = 0
        while True:
            # The next unread data always sits bom_length bytes past the
            # position where it has to be written.
            fp.seek(write_pos + bom_length)
            chunk = fp.read(buffer_size)
            if not chunk:
                break
            fp.seek(write_pos)
            fp.write(chunk)
            write_pos += len(chunk)

        # Everything has been shifted down; cut off the stale tail bytes.
        fp.truncate(write_pos)

import os, sys, codecs

def remove_bom_inplace(path, buffer_size=4096):
    """Remove a leading UTF-8 BOM from a file, rewriting it in place.

    The file is opened in binary read/write mode. If it starts with
    ``codecs.BOM_UTF8``, the remaining bytes are shifted towards the start
    of the file one chunk at a time and the file is then truncated to its
    new length, so the whole file is never held in memory. A file that
    does not start with a BOM (including an empty file) is left untouched.

    Args:
        path: Path of the file to repair.
        buffer_size: Number of bytes copied per read/write step. Must be a
            positive integer; defaults to 4096. A larger value reduces the
            number of seek/read/write round trips on big files.

    Raises:
        ValueError: If ``buffer_size`` is not positive.
        OSError: If the file cannot be opened for reading and writing.
    """
    if buffer_size <= 0:
        raise ValueError("buffer_size must be a positive integer")

    bom = codecs.BOM_UTF8
    bom_length = len(bom)

    with open(path, "r+b") as fp:
        # Inspect only the first bytes so that detection does not depend on
        # buffer_size being at least as large as the BOM.
        if fp.read(bom_length) != bom:
            return

        write_pos = 0
        while True:
            # The next unread data always sits bom_length bytes past the
            # position where it has to be written.
            fp.seek(write_pos + bom_length)
            chunk = fp.read(buffer_size)
            if not chunk:
                break
            fp.seek(write_pos)
            fp.write(chunk)
            write_pos += len(chunk)

        # Everything has been shifted down; cut off the stale tail bytes.
        fp.truncate(write_pos)

I actually use this in my TFLearn training scripts automatically, like this:

# Try the normal load first; the file is only touched if loading fails.
try:
    data, labels = load_csv(...)
except ValueError:
    # The load raised ValueError -- presumably because of a leading BOM.
    # Strip the BOM from the file in place, then retry the load once.
    # If the retry fails too, its exception propagates to the caller.
    strip_bom.remove_bom_inplace(filename)
    data, labels = load_csv(...)

# Try the normal load first; the file is only touched if loading fails.
try:
    data, labels = load_csv(...)
except ValueError:
    # The load raised ValueError -- presumably because of a leading BOM.
    # Strip the BOM from the file in place, then retry the load once.
    # If the retry fails too, its exception propagates to the caller.
    strip_bom.remove_bom_inplace(filename)
    data, labels = load_csv(...)

This way, any BOM-related error during loading automatically triggers a repair of the file, followed by a retry.

Leave a Reply Cancel reply