diff --git a/.travis.yml b/.travis.yml
index 19271c3..3030372 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,6 +5,8 @@ go:
   - "1.14"
   - tip
 go_import_path: code.sajari.com/docconv
+before_install:
+  - sudo apt-get -y install poppler-utils
 notifications:
   email:
     - infra@sajari.com
diff --git a/config.go b/config.go
new file mode 100644
index 0000000..7323f29
--- /dev/null
+++ b/config.go
@@ -0,0 +1,41 @@
+package docconv
+
+// Config holds the package-wide settings applied by the converters.
+type Config struct {
+	// Limitation bounds how much of a document is converted.
+	Limitation LenthLimitation
+}
+
+// LenthLimitation restricts how much of a document is converted. A field
+// left at zero or below disables the corresponding limit.
+type LenthLimitation struct {
+	// XMLMaxWord caps the text extracted while parsing XML, which affects
+	// Office 2007 zip format documents. Despite its name, the limit is
+	// compared against the byte length of the text extracted so far.
+	XMLMaxWord int
+	// PdfFirstPage is the first page to convert for PDF files.
+	PdfFirstPage int
+	// PdfLastPage is the last page to convert for PDF files.
+	PdfLastPage int
+}
+
+var (
+	config Config
+)
+
+// SetConfig replaces the package-wide configuration for docconv. The
+// assignment is not synchronized, so call it before starting conversions.
+func SetConfig(c Config) {
+	config = c
+}
+
+// checkXMLMaxWord reports whether an XML text limit is configured.
+func checkXMLMaxWord() bool {
+	return config.Limitation.XMLMaxWord > 0
+}
+
+// xmlMaxWordExceed reports whether length is greater than the configured
+// XML text limit.
+func xmlMaxWordExceed(length int) bool {
+	return length > config.Limitation.XMLMaxWord
+}
diff --git a/docconv_test.go b/docconv_test.go
index 39eaac0..2d5d94a 100644
--- a/docconv_test.go
+++ b/docconv_test.go
@@ -1,6 +1,7 @@
 package docconv
 
 import (
+	"os"
 	"strings"
 	"testing"
 )
@@ -18,3 +19,82 @@ func TestConvertTrimsSpace(t *testing.T) {
 		t.Errorf("body = %v, want %v", resp.Body, want)
 	}
 }
+
+func TestXMLMaxWord(t *testing.T) {
+	// The configuration is package-wide state: restore the zero value so
+	// tests that run afterwards are not affected.
+	defer SetConfig(Config{})
+
+	t.Run("max word not set", func(t *testing.T) {
+		SetConfig(Config{})
+		if got := checkXMLMaxWord(); got {
+			t.Fatalf("got %v, want false", got)
+		}
+	})
+	t.Run("test checkMaxWord", func(t *testing.T) {
+		SetConfig(Config{Limitation: LenthLimitation{XMLMaxWord: 10}})
+		if got := checkXMLMaxWord(); !got {
+			t.Fatalf("got %v, want true", got)
+		}
+	})
+	t.Run("test xmlMaxWordExceed", func(t *testing.T) {
+		SetConfig(Config{Limitation: LenthLimitation{XMLMaxWord: 10}})
+		if got := xmlMaxWordExceed(10); got {
+			t.Fatalf("got %v, want false", got)
+		}
+		if got := xmlMaxWordExceed(11); !got {
+			t.Fatalf("got %v, want true", got)
+		}
+	})
+	t.Run("test parse docx with maxword", func(t *testing.T) {
+		SetConfig(Config{Limitation: LenthLimitation{XMLMaxWord: 2}})
+		f, err := os.Open("./docx_test/testdata/sample_3.docx")
+		if err != nil {
+			t.Fatalf("got error = %v, want nil", err)
+		}
+		defer f.Close()
+
+		resp, _, err := ConvertDocx(f)
+		if err != nil {
+			t.Fatalf("got error = %v, want nil", err)
+		}
+		if want := "Content from docx file"; !strings.Contains(resp, want) {
+			t.Errorf("expected %v to contains %v", resp, want)
+		}
+		if want := "second"; strings.Contains(resp, want) {
+			t.Errorf("expected %v to not contains %v", resp, want)
+		}
+	})
+}
+
+func TestPDFPageLimit(t *testing.T) {
+	// Restore the zero-value configuration once the test finishes.
+	defer SetConfig(Config{})
+
+	SetConfig(Config{Limitation: LenthLimitation{PdfFirstPage: 2, PdfLastPage: 3}})
+	f, err := os.Open("./pdf_test/testdata/pdf.pdf")
+	if err != nil {
+		t.Fatalf("got error = %v, want nil", err)
+	}
+	defer f.Close()
+
+	resp, _, err := ConvertPDF(f)
+	if err != nil {
+		t.Fatalf("got error = %v, want nil", err)
+	}
+	if want := "2"; !strings.Contains(resp, want) {
+		t.Errorf("expected %v to contains %v", resp, want)
+	}
+	if want := "3"; !strings.Contains(resp, want) {
+		t.Errorf("expected %v to contains %v", resp, want)
+	}
+	if want := "1"; strings.Contains(resp, want) {
+		t.Errorf("expected %v to not contains %v", resp, want)
+	}
+	if want := "4"; strings.Contains(resp, want) {
+		t.Errorf("expected %v to not contains %v", resp, want)
+	}
+	if want := "5"; strings.Contains(resp, want) {
+		t.Errorf("expected %v to not contains %v", resp, want)
+	}
+}
diff --git a/pdf_test/testdata/pdf.pdf b/pdf_test/testdata/pdf.pdf
new file mode 100644
index 0000000..9cf8965
Binary files /dev/null and b/pdf_test/testdata/pdf.pdf differ
diff --git a/pdf_text.go b/pdf_text.go
index 01b65f8..ff49fe2 100644
--- a/pdf_text.go
+++ b/pdf_text.go
@@ -3,6 +3,7 @@ package docconv
 import (
 	"fmt"
 	"os/exec"
+	"strconv"
 	"strings"
 	"time"
 )
@@ -54,9 +55,22 @@ func ConvertPDFText(path string) (BodyResult, MetaResult, error) {
 		mr <- metaResult
 	}()
 
+	// Build the pdftotext arguments; the page range flags are only added
+	// when a positive first/last page has been configured.
+	parameters := []string{
+		"-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix",
+	}
+	if config.Limitation.PdfFirstPage > 0 {
+		parameters = append(parameters, "-f", strconv.Itoa(config.Limitation.PdfFirstPage))
+	}
+	if config.Limitation.PdfLastPage > 0 {
+		parameters = append(parameters, "-l", strconv.Itoa(config.Limitation.PdfLastPage))
+	}
+	parameters = append(parameters, path, "-")
+
 	br := make(chan BodyResult, 1)
 	go func() {
-		body, err := exec.Command("pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-").Output()
+		body, err := exec.Command("pdftotext", parameters...).Output()
 		if err != nil {
 			bodyResult.err = err
 		}
diff --git a/xml.go b/xml.go
index 8f422a3..9435e34 100644
--- a/xml.go
+++ b/xml.go
@@ -69,6 +69,11 @@ func XMLToText(r io.Reader, breaks []string, skip []string, strict bool) (string
 				}
 			}
 		}
+		// Stop reading once the extracted text is longer than the configured
+		// limit, to bound memory use on very large documents.
+		if checkXMLMaxWord() && xmlMaxWordExceed(len(result)) {
+			break
+		}
 	}
 	return result, nil
 }