提交 9338caa9 作者: Jeromy

working on making importer not break on large files

上级 ddd2a9ae
...@@ -3,7 +3,6 @@ package importer ...@@ -3,7 +3,6 @@ package importer
import ( import (
"fmt" "fmt"
"io" "io"
"io/ioutil"
"os" "os"
dag "github.com/jbenet/go-ipfs/merkledag" dag "github.com/jbenet/go-ipfs/merkledag"
...@@ -20,32 +19,17 @@ var ErrSizeLimitExceeded = fmt.Errorf("object size limit exceeded") ...@@ -20,32 +19,17 @@ var ErrSizeLimitExceeded = fmt.Errorf("object size limit exceeded")
// NewDagFromReader constructs a Merkle DAG from the given io.Reader. // NewDagFromReader constructs a Merkle DAG from the given io.Reader.
// size required for block construction. // size required for block construction.
func NewDagFromReader(r io.Reader, size int64) (*dag.Node, error) { func NewDagFromReader(r io.Reader) (*dag.Node, error) {
// todo: block-splitting based on rabin fingerprinting blkChan := SplitterBySize(1024 * 512)(r)
// todo: block-splitting with user-defined function root := &dag.Node{}
// todo: block-splitting at all. :P
// todo: write mote todos for blk := range blkChan {
child := &dag.Node{Data: blk}
// totally just trusts the reported size. fix later. err := root.AddNodeLink("", child)
if size > BlockSizeLimit { // 1 MB limit for now. if err != nil {
return nil, ErrSizeLimitExceeded return nil, err
}
} }
// Ensure that we dont get stuck reading way too much data
r = io.LimitReader(r, BlockSizeLimit)
// we're doing it live!
buf, err := ioutil.ReadAll(r)
if err != nil {
return nil, err
}
if int64(len(buf)) > BlockSizeLimit {
return nil, ErrSizeLimitExceeded // lying punk.
}
root := &dag.Node{Data: buf}
// no children for now because not block splitting yet
return root, nil return root, nil
} }
...@@ -66,5 +50,5 @@ func NewDagFromFile(fpath string) (*dag.Node, error) { ...@@ -66,5 +50,5 @@ func NewDagFromFile(fpath string) (*dag.Node, error) {
} }
defer f.Close() defer f.Close()
return NewDagFromReader(f, stat.Size()) return NewDagFromReader(f)
} }
package importer package importer
import ( import (
"testing"
"crypto/rand"
"bytes" "bytes"
"crypto/rand"
"testing"
) )
func TestDataSplitting(t *testing.T) { func TestDataSplitting(t *testing.T) {
buf := make([]byte, 16*1024*1024) buf := make([]byte, 16*1024*1024)
rand.Read(buf) rand.Read(buf)
split := Rabin(buf) split := Rabin(buf)
if len(split) == 1 { if len(split) == 1 {
...@@ -47,4 +46,3 @@ func TestDataSplitting(t *testing.T) { ...@@ -47,4 +46,3 @@ func TestDataSplitting(t *testing.T) {
t.Log(len(split)) t.Log(len(split))
t.Log(min, max, mxcount) t.Log(min, max, mxcount)
} }
package importer package importer
type BlockSplitter func([]byte) [][]byte import (
"io"
u "github.com/jbenet/go-ipfs/util"
)
type BlockSplitter func(io.Reader) chan []byte
// SplitterBySize returns a BlockSplitter that cuts the input stream into
// fixed-size chunks of n bytes. The final chunk may be shorter than n.
// The returned channel is closed once the reader is exhausted or a read
// error occurs; read errors other than io.EOF are logged via u.PErr.
func SplitterBySize(n int) BlockSplitter {
	return func(r io.Reader) chan []byte {
		out := make(chan []byte)
		go func(n int) {
			defer close(out)
			for {
				chunk := make([]byte, n)
				nread, err := r.Read(chunk)
				// Per the io.Reader contract, a Reader may return
				// nread > 0 together with a non-nil error (including
				// io.EOF). Emit the data before acting on the error,
				// and truncate to what was actually read so short
				// final chunks are not zero-padded to length n.
				if nread > 0 {
					out <- chunk[:nread]
				}
				if err != nil {
					if err != io.EOF {
						u.PErr("block split error: %v\n", err)
					}
					return
				}
			}
		}(n)
		return out
	}
}
// TODO: this should take a reader, not a byte array. what if we're splitting a 3TB file? // TODO: this should take a reader, not a byte array. what if we're splitting a 3TB file?
func Rabin(b []byte) [][]byte { func Rabin(b []byte) [][]byte {
...@@ -39,7 +70,7 @@ func Rabin(b []byte) [][]byte { ...@@ -39,7 +70,7 @@ func Rabin(b []byte) [][]byte {
} }
// first 13 bits of polynomial are 0 // first 13 bits of polynomial are 0
if poly % 8192 == 0 && i-blk_beg_i >= min_blk_size { if poly%8192 == 0 && i-blk_beg_i >= min_blk_size {
// push block // push block
out = append(out, b[blk_beg_i:i]) out = append(out, b[blk_beg_i:i])
blk_beg_i = i blk_beg_i = i
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论