forked from JalfResi/justext
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreviseParagraphClassification.go
118 lines (101 loc) · 3.01 KB
/
reviseParagraphClassification.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package justext
// reviseParagraphClassification performs the context-sensitive pass of
// paragraph classification. It assumes the context-free classification has
// already been run, so that every paragraph's CfClass is populated.
//
// Each paragraph's Class is first reset from its CfClass and then revised in
// place in four steps:
//
//  1. A heading classified "short" becomes "neargood" when a "good" paragraph
//     follows it within maxHeadingDistance characters.
//  2. Every remaining "short" paragraph becomes "good" or "bad" depending on
//     its nearest "good"/"bad" neighbours.
//  3. Every "neargood" paragraph becomes "bad" when both of its neighbours
//     are "bad", and "good" otherwise.
//  4. A heading that ended up "bad" although its CfClass was not "bad" is
//     promoted to "good" when a "good" paragraph follows it within
//     maxHeadingDistance characters.
//
// maxHeadingDistance is measured in characters (runes), not bytes, of the
// Text of the paragraphs lying between the heading and the good paragraph.
func reviseParagraphClassification(paragraphs []*Paragraph, maxHeadingDistance int) {
	// Start from the context-free classes.
	for _, p := range paragraphs {
		p.Class = p.CfClass
	}

	// Good headings.
	for i, p := range paragraphs {
		if p.Heading && p.Class == "short" && goodParagraphFollows(paragraphs, i, maxHeadingDistance) {
			p.Class = "neargood"
		}
	}

	// Classify short paragraphs. Decisions are staged in newClasses and
	// applied afterwards so that every decision in this pass is based on the
	// classes as they were when the pass started.
	newClasses := make([]string, len(paragraphs))
	for i, p := range paragraphs {
		if p.Class != "short" {
			continue
		}
		prev := getPrevNeighbour(i, paragraphs, true)
		next := getNextNeighbour(i, paragraphs, true)
		switch {
		case prev == "good" && next == "good":
			newClasses[i] = "good"
		case prev == "bad" && next == "bad":
			newClasses[i] = "bad"
		// From here on the neighbours are mixed: a "bad" side still counts
		// in favour of the paragraph when a "neargood" paragraph sits
		// closer on that side than the "bad" one.
		case (prev == "bad" && getPrevNeighbour(i, paragraphs, false) == "neargood") ||
			(next == "bad" && getNextNeighbour(i, paragraphs, false) == "neargood"):
			newClasses[i] = "good"
		default:
			newClasses[i] = "bad"
		}
	}
	for i, c := range newClasses {
		if c != "" {
			paragraphs[i].Class = c
		}
	}

	// Revise neargood.
	for i, p := range paragraphs {
		if p.Class != "neargood" {
			continue
		}
		prev := getPrevNeighbour(i, paragraphs, true)
		next := getNextNeighbour(i, paragraphs, true)
		if prev == "bad" && next == "bad" {
			p.Class = "bad"
		} else {
			p.Class = "good"
		}
	}

	// More good headings.
	for i, p := range paragraphs {
		if p.Heading && p.Class == "bad" && p.CfClass != "bad" && goodParagraphFollows(paragraphs, i, maxHeadingDistance) {
			p.Class = "good"
		}
	}
}

// goodParagraphFollows reports whether a paragraph whose Class is "good"
// comes after paragraphs[start] before more than maxDistance characters of
// other paragraphs' Text have been skipped.
func goodParagraphFollows(paragraphs []*Paragraph, start int, maxDistance int) bool {
	distance := 0
	for j := start + 1; j < len(paragraphs) && distance <= maxDistance; j++ {
		if paragraphs[j].Class == "good" {
			return true
		}
		// Count runes rather than bytes so that multi-byte (non-ASCII) text
		// is not measured as longer than it is. Text is assumed to be a
		// string.
		distance += len([]rune(paragraphs[j].Text))
	}
	return false
}
// getPrevNeighbour returns the class reported by getNeighbour when scanning
// backwards from the paragraph just before index i down to the start of
// paragraphs. ignoreNeargood is passed through unchanged.
func getPrevNeighbour(i int, paragraphs []*Paragraph, ignoreNeargood bool) string {
	const (
		step = -1 // walk towards the beginning of the slice
		stop = -1 // index one before the first paragraph
	)
	return getNeighbour(i, paragraphs, ignoreNeargood, step, stop)
}
// getNextNeighbour returns the class reported by getNeighbour when scanning
// forwards from the paragraph just after index i up to the end of
// paragraphs. ignoreNeargood is passed through unchanged.
func getNextNeighbour(i int, paragraphs []*Paragraph, ignoreNeargood bool) string {
	const step = 1          // walk towards the end of the slice
	stop := len(paragraphs) // index one past the last paragraph
	return getNeighbour(i, paragraphs, ignoreNeargood, step, stop)
}
// getNeighbour walks paragraphs starting at index i+inc, advancing by inc
// each step and stopping once the index equals boundary (which is never
// visited). It returns the Class of the first paragraph visited that is
// "good" or "bad", or "neargood" when ignoreNeargood is false. Paragraphs
// with any other class are skipped. If the boundary is reached without a
// match, "bad" is returned.
func getNeighbour(i int, paragraphs []*Paragraph, ignoreNeargood bool, inc int, boundary int) string {
	for idx := i + inc; idx != boundary; idx += inc {
		switch class := paragraphs[idx].Class; class {
		case "good", "bad":
			return class
		case "neargood":
			if !ignoreNeargood {
				return class
			}
		}
	}
	// Running off either end of the document counts as a bad neighbour.
	return "bad"
}