xiaobaoqiu Blog

Think More, Code Less

How to Read Files Quickly

Java中有很多读写文件或者网络流的方法,特别是Java7中加入了AIO之后,可选择的方法更多,本文的目的是想比较一下各种方式的优劣。

本文基本是参考这篇英文文章:http://nadeausoftware.com/articles/2008/02/java_tip_how_read_files_quickly

1.方法List

这里涉及的读文件的方法包括:

1.1 FileInputStreamOneByte

1
2
3
4
5
6
7
8
9
10
/**
 * Reads the whole file one byte at a time through an unbuffered
 * {@link FileInputStream} and sums the bytes.
 *
 * @return the sum of every value returned by {@code read()} (each in 0..255)
 * @throws IOException if the file cannot be opened, read, or closed
 */
@Override
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the stream even when read() throws,
    // which the previous trailing close call did not.
    try (FileInputStream fis = new FileInputStream(filePath)) {
        int b;
        while ((b = fis.read()) != -1) {
            checkSum += b;
        }
    }
    return checkSum;
}

1.2 FileInputStreamBytes

1
2
3
4
5
6
7
8
9
10
11
12
13
/**
 * Reads the whole file in chunks of up to {@code BUF_SIZE} bytes through an
 * unbuffered {@link FileInputStream} and sums the bytes.
 *
 * @return the sum of all bytes, each added as a signed {@code byte} (-128..127)
 * @throws IOException if the file cannot be opened, read, or closed
 */
@Override
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the stream even when read() throws.
    try (FileInputStream fis = new FileInputStream(filePath)) {
        byte[] array = new byte[BUF_SIZE];
        int nRead;
        while ((nRead = fis.read(array, 0, BUF_SIZE)) != -1) {
            // Only the first nRead entries are valid for this chunk.
            for (int i = 0; i < nRead; i++) {
                checkSum += array[i];
            }
        }
    }
    return checkSum;
}

1.3 BufferedInputStreamOneByte

1
2
3
4
5
6
7
8
9
10
/**
 * Reads the whole file one byte at a time through a {@link BufferedInputStream}
 * whose internal buffer is {@code BUF_SIZE} bytes, and sums the bytes.
 *
 * @return the sum of every value returned by {@code read()} (each in 0..255)
 * @throws IOException if the file cannot be opened, read, or closed
 */
@Override
public long doRead() throws IOException {
    long checkSum = 0L;
    // Closing the BufferedInputStream also closes the wrapped FileInputStream;
    // try-with-resources guarantees this even when read() throws.
    try (BufferedInputStream bis =
            new BufferedInputStream(new FileInputStream(filePath), BUF_SIZE)) {
        int b;
        while ((b = bis.read()) != -1) {
            checkSum += b;
        }
    }
    return checkSum;
}

1.4 BufferedInputStreamBytes

1
2
3
4
5
6
7
8
9
10
11
12
13
/**
 * Reads the whole file in chunks of up to {@code BUF_SIZE} bytes through a
 * {@link BufferedInputStream} (internal buffer also {@code BUF_SIZE}) and sums
 * the bytes.
 *
 * @return the sum of all bytes, each added as a signed {@code byte} (-128..127)
 * @throws IOException if the file cannot be opened, read, or closed
 */
@Override
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the stream chain even when read() throws.
    try (BufferedInputStream bis =
            new BufferedInputStream(new FileInputStream(filePath), BUF_SIZE)) {
        byte[] array = new byte[BUF_SIZE];
        int nRead;
        while ((nRead = bis.read(array, 0, BUF_SIZE)) != -1) {
            // Only the first nRead entries are valid for this chunk.
            for (int i = 0; i < nRead; i++) {
                checkSum += array[i];
            }
        }
    }
    return checkSum;
}

1.5 RandomAccessOneByte

1
2
3
4
5
6
7
8
9
10
/**
 * Reads the whole file one byte at a time through a {@link RandomAccessFile}
 * opened with {@code READ_MODE}, and sums the bytes.
 *
 * @return the sum of every value returned by {@code read()} (each in 0..255)
 * @throws IOException if the file cannot be opened, read, or closed
 */
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the file even when read() throws.
    try (RandomAccessFile raf = new RandomAccessFile(filePath, READ_MODE)) {
        int b;
        while ((b = raf.read()) != -1) {
            checkSum += b;
        }
    }
    return checkSum;
}

1.6 RandomAccessBytes

1
2
3
4
5
6
7
8
9
10
11
12
/**
 * Reads the whole file in chunks of up to {@code BUF_SIZE} bytes through a
 * {@link RandomAccessFile} opened with {@code READ_MODE}, and sums the bytes.
 *
 * @return the sum of all bytes, each added as a signed {@code byte} (-128..127)
 * @throws IOException if the file cannot be opened, read, or closed
 */
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the file even when read() throws.
    try (RandomAccessFile raf = new RandomAccessFile(filePath, READ_MODE)) {
        byte[] array = new byte[BUF_SIZE];
        int nRead;
        while ((nRead = raf.read(array, 0, BUF_SIZE)) != -1) {
            // Only the first nRead entries are valid for this chunk.
            for (int i = 0; i < nRead; i++) {
                checkSum += array[i];
            }
        }
    }
    return checkSum;
}

1.7 FileChannelByteBufferOneByte

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/**
 * Reads the whole file through a {@link FileChannel} into a heap
 * {@link ByteBuffer} of {@code BUF_SIZE} bytes, then drains the buffer one
 * byte at a time and sums the bytes.
 *
 * @return the sum of all bytes, each added as a signed {@code byte} (-128..127)
 * @throws IOException if the file cannot be opened, read, or closed
 */
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the channel and stream even when read() throws.
    try (FileInputStream fis = new FileInputStream(filePath);
         FileChannel ch = fis.getChannel()) {
        ByteBuffer bb = ByteBuffer.allocate(BUF_SIZE);
        while (ch.read(bb) != -1) {
            // The buffer is always filled from position 0, so flip() yields
            // position 0 and limit = bytes just read; a 0-byte read leaves
            // nothing to drain.
            bb.flip();
            while (bb.hasRemaining()) {
                checkSum += bb.get(); // read one byte from the ByteBuffer
            }
            bb.clear();
        }
    }
    return checkSum;
}

1.8 FileChannelByteBufferBytes

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 * Reads the whole file through a {@link FileChannel} into a heap
 * {@link ByteBuffer} of {@code BIG_BUF_SIZE} bytes, then drains the buffer in
 * bulk into a {@code BUF_SIZE} array and sums the bytes.
 *
 * @return the sum of all bytes, each added as a signed {@code byte} (-128..127)
 * @throws IOException if the file cannot be opened, read, or closed
 */
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the channel and stream even when read() throws.
    try (FileInputStream fis = new FileInputStream(filePath);
         FileChannel ch = fis.getChannel()) {
        ByteBuffer bb = ByteBuffer.allocate(BIG_BUF_SIZE);
        byte[] array = new byte[BUF_SIZE];
        while (ch.read(bb) != -1) {
            // The buffer is always filled from position 0, so flip() yields
            // position 0 and limit = bytes just read.
            bb.flip();
            while (bb.hasRemaining()) {
                // Never request more than the array holds or the buffer has left.
                int nGet = Math.min(bb.remaining(), BUF_SIZE);
                bb.get(array, 0, nGet);
                for (int i = 0; i < nGet; i++) {
                    checkSum += array[i];
                }
            }
            bb.clear();
        }
    }
    return checkSum;
}

1.9 FileChannelByteBufferWrap

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Reads the whole file through a {@link FileChannel} into a {@link ByteBuffer}
 * that wraps a local {@code BUF_SIZE} array, and sums the bytes directly from
 * that backing array.
 *
 * @return the sum of all bytes, each added as a signed {@code byte} (-128..127)
 * @throws IOException if the file cannot be opened, read, or closed
 */
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the channel and stream even when read() throws.
    try (FileInputStream fis = new FileInputStream(filePath);
         FileChannel ch = fis.getChannel()) {
        byte[] barray = new byte[BUF_SIZE];
        ByteBuffer bb = ByteBuffer.wrap(barray); // the ByteBuffer wraps the local array
        int nRead;
        while ((nRead = ch.read(bb)) != -1) {
            // Bytes land in barray via the wrapper, so read them from the array.
            for (int i = 0; i < nRead; i++) {
                checkSum += barray[i];
            }
            bb.clear();
        }
    }
    return checkSum;
}

1.10 FileChannelMappedByteBufferOneByte

1
2
3
4
5
6
7
8
9
10
/**
 * Maps the whole file read-only into memory and sums the bytes one at a time
 * from the {@link MappedByteBuffer}.
 *
 * @return the sum of all bytes, each added as a signed {@code byte} (-128..127)
 * @throws IOException if the file cannot be opened, mapped, or closed
 */
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the channel and stream even when map() throws.
    try (FileInputStream fis = new FileInputStream(filePath);
         FileChannel ch = fis.getChannel()) {
        MappedByteBuffer mb = ch.map(FileChannel.MapMode.READ_ONLY, 0L, ch.size());
        while (mb.hasRemaining()) {
            checkSum += mb.get();
        }
    }
    return checkSum;
}

1.11 FileChannelMappedByteBufferBytes

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/**
 * Maps the whole file read-only into memory, drains the
 * {@link MappedByteBuffer} in bulk into a {@code BUF_SIZE} array, and sums the
 * bytes.
 *
 * @return the sum of all bytes, each added as a signed {@code byte} (-128..127)
 * @throws IOException if the file cannot be opened, mapped, or closed
 */
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the channel and stream even when map() throws.
    try (FileInputStream fis = new FileInputStream(filePath);
         FileChannel ch = fis.getChannel()) {
        MappedByteBuffer mb = ch.map(FileChannel.MapMode.READ_ONLY, 0L, ch.size());
        byte[] barray = new byte[BUF_SIZE];
        while (mb.hasRemaining()) {
            // Never request more than the array holds or the mapping has left.
            int nGet = Math.min(mb.remaining(), BUF_SIZE);
            mb.get(barray, 0, nGet);
            for (int i = 0; i < nGet; i++) {
                checkSum += barray[i];
            }
        }
    }
    return checkSum;
}

1.12 FileChannelDirectByteBufferOneByte

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/**
 * Reads the whole file through a {@link FileChannel} into a direct
 * {@link ByteBuffer} of {@code BUF_SIZE} bytes, then drains the buffer one
 * byte at a time and sums the bytes.
 *
 * @return the sum of all bytes, each added as a signed {@code byte} (-128..127)
 * @throws IOException if the file cannot be opened, read, or closed
 */
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the channel and stream even when read() throws.
    try (FileInputStream fis = new FileInputStream(filePath);
         FileChannel ch = fis.getChannel()) {
        ByteBuffer bb = ByteBuffer.allocateDirect(BUF_SIZE);
        while (ch.read(bb) != -1) {
            // The buffer is always filled from position 0, so flip() yields
            // position 0 and limit = bytes just read.
            bb.flip();
            while (bb.hasRemaining()) {
                checkSum += bb.get();
            }
            bb.clear();
        }
    }
    return checkSum;
}

1.13 FileChannelDirectByteBufferBytes

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 * Reads the whole file through a {@link FileChannel} into a direct
 * {@link ByteBuffer} of {@code BIG_BUF_SIZE} bytes, then drains the buffer in
 * bulk into a {@code BUF_SIZE} array and sums the bytes.
 *
 * @return the sum of all bytes, each added as a signed {@code byte} (-128..127)
 * @throws IOException if the file cannot be opened, read, or closed
 */
public long doRead() throws IOException {
    long checkSum = 0L;
    // try-with-resources closes the channel and stream even when read() throws.
    try (FileInputStream fis = new FileInputStream(filePath);
         FileChannel ch = fis.getChannel()) {
        ByteBuffer bb = ByteBuffer.allocateDirect(BIG_BUF_SIZE);
        byte[] barray = new byte[BUF_SIZE];
        while (ch.read(bb) != -1) {
            // The buffer is always filled from position 0, so flip() yields
            // position 0 and limit = bytes just read.
            bb.flip();
            while (bb.hasRemaining()) {
                // Never request more than the array holds or the buffer has left.
                int nGet = Math.min(bb.remaining(), BUF_SIZE);
                bb.get(barray, 0, nGet);
                for (int i = 0; i < nGet; i++) {
                    checkSum += barray[i];
                }
            }
            bb.clear();
        }
    }
    return checkSum;
}

2.测试

测试文件的大小为83M:

1
2
xiaobaoqiu@xiaobaoqiu:~/Documents/TestData$ ll -h BXBooks.sql
-rw------- 1 xiaobaoqiu xiaobaoqiu 83M 10月  8  2004 BXBooks.sql

3.结论

得到测试数据之后,为了直观展示代码的速度,用ECharts展示,对应的js代码如下(可以在 http://echarts.baidu.com/doc/example/line1.html#helianthus 中使用):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// ECharts option object for the benchmark chart: one line series per read method.
option = {
    title : {
        text: 'how_to_read_file_quickly',
        subtext: 'baoqiu.xiao'
    },
    tooltip : {
        trigger: 'axis'
    },
    // One legend entry per series name defined under `series` below.
    legend: {
        data:['FileInputStreamOneByte','FileInputStreamBytes','BufferedInputStreamOneByte','BufferedInputStreamBytes','RandomAccessOneByte','RandomAccessBytes','FileChannelByteBufferOneByte','FileChannelByteBufferBytes','FileChannelByteBufferWrap','FileChannelMappedByteBufferOneByte','FileChannelMappedByteBufferBytes','FileChannelDirectByteBufferOneByte','FileChannelDirectByteBufferBytes']
    },
    // NOTE(review): start/end look like the initially visible window as
    // percentages of the x-axis (0-30) - confirm against the ECharts dataZoom docs.
    dataZoom : {
        show : true,
        realtime: true,
        start : 0,
        end : 30
    },
    toolbox: {
        show : true,
        feature : {
            saveAsImage : {show: true}
        }
    },
    calculable : true,
    xAxis : [
        {
            type : 'category',
            boundaryGap : false,
            // 18 categories; presumably the buffer size used for each run
            // (1 byte up to 128K) - the axis itself carries no unit label.
            data : ['1','2','4','8','16','32','64','128','256','512','1K','2K','4K','8K','16K','32K','64K','128K']
        }
    ],
    yAxis : [
        {
            type : 'value',
            axisLabel : {
                // Axis label unit: 毫秒 = milliseconds.
                formatter: '{value} 毫秒'
            }
        }
    ],
    // Each series holds 18 values, one per x-axis category above.
    series : [
        {
            name:'FileInputStreamOneByte',
            type:'line',
            data:[29119,28353,27872,27950,32035,40377,34904,36804,36835,33177,32252,37362,32359,31892,32621,32312,32943,33369]
        },
        {
            name:'FileInputStreamBytes',
            type:'line',
            data:[38182,20208,10194,4768,2616,1551,645,471,262,183,150,100,118,125,96,83,124,110]
        },
        {
            name:'BufferedInputStreamOneByte',
            type:'line',
            data:[33404,16966,8613,4718,2525,1491,980,728,677,681,558,608,557,615,505,606,656,693]
        },
        {
            name:'BufferedInputStreamBytes',
            type:'line',
            data:[33880,17333,8487,4851,2293,1632,627,358,242,147,129,113,89,92,107,88,100,95]
        },
        {
            name:'RandomAccessOneByte',
            type:'line',
            data:[28430,27445,26794,27053,28929,28633,29509,28870,28195,27376,28236,27444,28061,28889,27423,28064,28481,28084]
        },
        {
            name:'RandomAccessBytes',
            type:'line',
            data:[31497,16162,8135,4365,2131,1117,561,321,193,146,110,101,109,87,89,91,102,96]
        },
        {
            name:'FileChannelByteBufferOneByte',
            type:'line',
            data:[38778,18325,9447,5344,2430,1413,779,507,380,310,298,259,267,263,249,264,260,258]
        },
        {
            name:'FileChannelByteBufferBytes',
            type:'line',
            data:[19279,9823,5090,2605,1407,854,577,380,333,272,272,239,237,241,237,234,249,270]
        },
        {
            name:'FileChannelByteBufferWrap',
            type:'line',
            data:[38382,17636,9094,4608,2327,1198,646,355,249,151,108,100,92,90,88,103,85,92]
        },
        {
            name:'FileChannelMappedByteBufferOneByte',
            type:'line',
            data:[913,240,92,83,82,87,85,88,83,94,87,90,81,124,81,84,83,120]
        },
        {
            name:'FileChannelMappedByteBufferBytes',
            type:'line',
            data:[731,447,305,332,225,164,139,110,98,88,87,140,97,86,93,92,92,92]
        },
        {
            name:'FileChannelDirectByteBufferOneByte',
            type:'line',
            data:[31287,14473,7296,3815,1919,1001,547,319,200,135,109,98,84,77,80,82,77,86]
        },
        {
            name:'FileChannelDirectByteBufferBytes',
            type:'line',
            data:[14945,8060,4107,2174,1120,580,334,206,149,110,96,95,79,89,81,81,86,84]
        }       
    ]
};

整体展示图如下:

3.1 最快的三种方法

为了展示更细节的地方,设置了y区间范围,在yAxis上设置min和max属性:

1
2
3
4
5
6
7
8
9
10
// Value axis restricted to the range 0..200 via min/max so that the low end
// of the chart is shown in more detail.
yAxis : [
        {
            type : 'value',
            min:0,
            max:200,
            axisLabel : {
                // Axis label unit: 毫秒 = milliseconds.
                formatter: '{value} 毫秒'
            }
        }
    ],

得到如下展示图:

从上面这个图可以大致看出,最快的三种方法(PS,这里只做了一次实验,不是多次实验的平均值,所以数据可能不一定准确):

1.FileChannelMappedByteBufferBytes
2.FileChannelMappedByteBufferOneByte
3.FileChannelDirectByteBufferBytes

4.解析

下面解析一下代码中之前接触较少的东东。

4.1 RandomAccessFile

支持对文件的随机读取和写入。随机存取文件的行为类似存储在文件系统中的一个大型字节数组。存在指向该隐含数组的光标或索引,称为文件指针。

读取操作从文件指针开始读取字节,并随着对字节的读取而前移此文件指针。如果随机存取文件以读取/写入模式创建,则写入操作也可用;写入操作从文件指针开始写入字节,并随着对字节的写入而前移此文件指针。该文件指针可以通过 getFilePointer 方法读取,并通过 seek 方法设置。

提供了boolean,byte,char, short, int, long, float, double这些基本类型的read和write方法。

4.2 FileChannel

FileChannel是一个连接到文件的通道。可以通过文件通道读写文件。FileChannel无法设置为非阻塞模式,它总是运行在阻塞模式下。

在使用FileChannel之前,必须先打开它。通常需要通过FileInputStream、FileOutputStream或RandomAccessFile的getChannel()方法来获取一个FileChannel实例(从Java 7开始,也可以使用FileChannel.open()直接打开)。

1
2
// Open the file as a stream, then obtain the FileChannel from that stream.
FileInputStream fis = new FileInputStream(filePath);
FileChannel ch = fis.getChannel();

有时可能需要在FileChannel的某个特定位置进行数据的读/写操作。可以通过调用position()方法获取FileChannel的当前位置。也可以通过调用position(long pos)方法设置FileChannel的当前位置。

FileChannel实例的size()方法将返回该实例所关联文件的大小。

FileChannel.force()方法将通道里尚未写入磁盘的数据强制写到磁盘上。出于性能方面的考虑,操作系统会将数据缓存在内存中,所以无法保证写入到FileChannel里的数据一定会即时写到磁盘上。要保证这一点,需要调用force()方法。

4.3 ByteBuffer

ByteBuffer知识下一次单独讲。

5.BufferedInputStream的defaultBufferSize

BufferedInputStream默认buffer大小为8K:

1
2
3
public class BufferedInputStream extends FilterInputStream {
    // Default buffer size: 8192 (8K).
    private static int defaultBufferSize = 8192;
}

这个从图上也可以比较直观地看到,在缓存为8K左右的时候,性能最好:

6.参考:

http://nadeausoftware.com/articles/2008/02/java_tip_how_read_files_quickly

http://colobu.com/2014/10/20/java-buffer-basic/