Skip to content

Commit

Permalink
fix: performance optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
kris-dev-hub committed Dec 15, 2023
1 parent 42e21a7 commit b8b0cae
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 48 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ There are around 6 billion unique external backlinks per month in the common cra
- 4GB of RAM is the minimum requirement. An additional 1.5GB of RAM is required for each extra parsing thread.
- Minimum 50GB of free disk for every segment parsed at the same time.
- MongoDB requires a minimum of 2GB of disk space for every segment. 200MB of RAM for every imported segment is optimal.
- lzop installed on the system.

## Alpha Version Disclaimer
This is an alpha version of GlobalLinks and is subject to changes. The software is provided "as is", without warranty of any kind.
Expand Down
50 changes: 34 additions & 16 deletions cmd/importer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@ import (

"github.com/kris-dev-hub/globallinks/pkg/commoncrawl"
"github.com/kris-dev-hub/globallinks/pkg/fileutils"

_ "net/http/pprof"
)

const (
savePageData = false // collect and parse page data
lowDiscSpaceMode = true // encrypt tmp files to save disc space during sorting, requires lzop installed
healthCheckMode = true // enable health check api to monitor application on port 3005: http://localhost:3005/health
pprofMode = false // enable pprof api to monitor application on port 6060: http://localhost:6060/debug/pprof/
)

const (
Expand Down Expand Up @@ -54,6 +57,13 @@ type FileLinkCompacted struct {
}

func main() {

Check failure on line 60 in cmd/importer/main.go

View workflow job for this annotation

GitHub Actions / lint

File is not `gofumpt`-ed (gofumpt)
if pprofMode == true {
go func() {
log.Println(http.ListenAndServe("localhost:6060", nil))
}()
}

var err error
var archiveName string
var segmentsToImport []int
Expand Down Expand Up @@ -513,24 +523,32 @@ func compactSegmentData(segment commoncrawl.WatSegment, dataDir commoncrawl.Data
return fmt.Errorf("could not delete WAT processed files: %v", err)
}
}
err = fileutils.DeleteDirectoryIfEmpty(dataDir.TmpDir + "/" + segment.Segment)
if err != nil {
return fmt.Errorf("could not delete tmp directories: %v", err)
}

err = aggressiveCompacting(linkSegmentSorted, linkSegmentCompacted)
if err != nil {
return fmt.Errorf("could not compact file: %v", err)
}
err = os.Remove(linkSegmentSorted)
if err != nil {
return fmt.Errorf("could not delete file: %v", err)
}
if fileutils.FileExists(linkSegmentSorted) {

// save info that segment was finished
err = commoncrawl.UpdateSegmentImportEnd(segmentList, segment.Segment)
if err != nil {
return fmt.Errorf("%v", err)
err = fileutils.DeleteDirectoryIfEmpty(dataDir.TmpDir + "/" + segment.Segment)
if err != nil {
return fmt.Errorf("could not delete tmp directories: %v", err)
}

err = aggressiveCompacting(linkSegmentSorted, linkSegmentCompacted)
if err != nil {
return fmt.Errorf("could not compact file: %v", err)
}
err = os.Remove(linkSegmentSorted)
if err != nil {
return fmt.Errorf("could not delete file: %v", err)
}

// save info that segment was finished
err = commoncrawl.UpdateSegmentImportEnd(segmentList, segment.Segment)
if err != nil {
return fmt.Errorf("%v", err)
}
} else {
if err != nil {
return fmt.Errorf("can't find sorted file!\n")
}
}
}

Expand Down
43 changes: 11 additions & 32 deletions pkg/commoncrawl/wat.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ type DataDir struct {
// saves around 1s per 1M lines on one i5-9300H core
var ipRegex = regexp.MustCompile(`^(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)){3}$`)

var isValidDomainRegex = regexp.MustCompile(`^(?i)([a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}$`)

// initialize a map for fast lookups - it will be used to ignore certain domains and extensions
var (
ignoreDomains = map[string]bool{}
Expand Down Expand Up @@ -591,28 +593,8 @@ func verifyRecordQuality(record *URLRecord) bool {

// validateHost - validate host for strange characters and no dots
func validateHost(host string) bool {
if strings.Contains(host, "%") ||
strings.Contains(host, "[") ||
strings.Contains(host, "]") ||
strings.Contains(host, "=") ||
strings.Contains(host, "'") ||
strings.Contains(host, ":") ||
strings.Contains(host, "*") ||
strings.Contains(host, "(") ||
strings.Contains(host, ")") ||
strings.Contains(host, "<") ||
strings.Contains(host, ">") ||
strings.Contains(host, "&") ||
strings.Contains(host, "!") ||
strings.Contains(host, "+") ||
strings.Contains(host, "`") ||
strings.Contains(host, ",") ||
strings.Contains(host, "}") ||
strings.Contains(host, "{") ||
strings.Contains(host, "$") ||
strings.Contains(host, "\"") ||
strings.Contains(host, ":") ||
strings.Contains(host, ";") {

Check failure on line 596 in pkg/commoncrawl/wat.go

View workflow job for this annotation

GitHub Actions / lint

File is not `gofumpt`-ed (gofumpt)
if strings.ContainsAny(host, "%[]=':*()<>!&+,}{}$\";`") {
return false
}

Expand All @@ -628,12 +610,14 @@ func validateHost(host string) bool {
return true
}

// final verification of domain
// IsValidDomain - final verification of domain
func IsValidDomain(domain string) bool {
// Regular expression to match valid domain characters and rules
// This regex does not cover all possible TLDs and might need modification for specific cases
re := regexp.MustCompile(`^(?i)([a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}$`)
return re.MatchString(domain)
// re := regexp.MustCompile(`^(?i)([a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}$`)

// moving regex to global variable to make it faster - saved around 10% of time
return isValidDomainRegex.MatchString(domain)
}

// buildURLRecord - build url record from source url, check domain, path, query, etc.
Expand All @@ -651,13 +635,8 @@ func buildURLRecord(sourceURL string, urlRecord *URLRecord) bool {
return false
}

// ignore path with \n
if strings.Contains(parsedURL.Path, "\n") {
return false
}

// ignore path with | char
if strings.Contains(parsedURL.Path, "|") {
// ignore path with \n and | char
if strings.ContainsAny(parsedURL.Path, "\n|") {
return false
}

Expand Down

0 comments on commit b8b0cae

Please sign in to comment.