diff --git a/README.md b/README.md index 6150eb5..ae1af94 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,7 @@ There are around 6 billion unique external backlinks per month in the common cra - 4GB of RAM is the minimum requirement. Requires 1.5GB of RAM per every next thread for parsing. - Minimum 50GB of free disk for every segment parsed at the same time. - MongoDB require minimum 2GB of disc space for every segment. 200MB of ram for every imported segment is optimal. +- lzop installed on the system. ## Alpha Version Disclaimer This is an alpha version of GlobalLinks and is subject to changes. The software is provided "as is", without warranty of any kind. diff --git a/cmd/importer/main.go b/cmd/importer/main.go index 04a2bf5..ceb114e 100644 --- a/cmd/importer/main.go +++ b/cmd/importer/main.go @@ -19,12 +19,15 @@ import ( "github.com/kris-dev-hub/globallinks/pkg/commoncrawl" "github.com/kris-dev-hub/globallinks/pkg/fileutils" + + _ "net/http/pprof" ) const ( savePageData = false // collect and parse page data lowDiscSpaceMode = true // encrypt tmp files to save disc space during sorting, requires lzop installed healthCheckMode = true // enable health check api to monitor application on port 3005: http://localhost:3005/health + pprofMode = false // enable pprof api to monitor application on port 6060: http://localhost:6060/debug/pprof/ ) const ( @@ -54,6 +57,13 @@ type FileLinkCompacted struct { } func main() { + + if pprofMode == true { + go func() { + log.Println(http.ListenAndServe("localhost:6060", nil)) + }() + } + var err error var archiveName string var segmentsToImport []int @@ -513,24 +523,32 @@ func compactSegmentData(segment commoncrawl.WatSegment, dataDir commoncrawl.Data return fmt.Errorf("could not delete WAT processed files: %v", err) } } - err = fileutils.DeleteDirectoryIfEmpty(dataDir.TmpDir + "/" + segment.Segment) - if err != nil { - return fmt.Errorf("could not delete tmp directories: %v", err) - } - err = aggressiveCompacting(linkSegmentSorted, linkSegmentCompacted) - if err != nil { - return fmt.Errorf("could not compact file: %v", err) - } - err = os.Remove(linkSegmentSorted) - if err != nil { - return fmt.Errorf("could not delete file: %v", err) - } + if fileutils.FileExists(linkSegmentSorted) { - // save info that segment was finished - err = commoncrawl.UpdateSegmentImportEnd(segmentList, segment.Segment) - if err != nil { - return fmt.Errorf("%v", err) + err = fileutils.DeleteDirectoryIfEmpty(dataDir.TmpDir + "/" + segment.Segment) + if err != nil { + return fmt.Errorf("could not delete tmp directories: %v", err) + } + + err = aggressiveCompacting(linkSegmentSorted, linkSegmentCompacted) + if err != nil { + return fmt.Errorf("could not compact file: %v", err) + } + err = os.Remove(linkSegmentSorted) + if err != nil { + return fmt.Errorf("could not delete file: %v", err) + } + + // save info that segment was finished + err = commoncrawl.UpdateSegmentImportEnd(segmentList, segment.Segment) + if err != nil { + return fmt.Errorf("%v", err) + } + } else { + if err != nil { + return fmt.Errorf("can't find sorted file!\n") + } } } diff --git a/pkg/commoncrawl/wat.go b/pkg/commoncrawl/wat.go index 2223b7e..74dca33 100644 --- a/pkg/commoncrawl/wat.go +++ b/pkg/commoncrawl/wat.go @@ -122,6 +122,8 @@ type DataDir struct { // saves around 1s per 1M lines on one i5-9300H core var ipRegex = regexp.MustCompile(`^(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)){3}$`) +var isValidDomainRegex = regexp.MustCompile(`^(?i)([a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}$`) + // initialize a map for fast lookups - it will be used to ignore certain domains and extensions var ( ignoreDomains = map[string]bool{} @@ -591,28 +593,8 @@ func verifyRecordQuality(record *URLRecord) bool { // validateHose - validate host for strange characters and no dots func validateHost(host string) bool { - if strings.Contains(host, "%") || - strings.Contains(host, "[") || - strings.Contains(host, "]") || - strings.Contains(host, "=") || - strings.Contains(host, "'") || - strings.Contains(host, ":") || - strings.Contains(host, "*") || - strings.Contains(host, "(") || - strings.Contains(host, ")") || - strings.Contains(host, "<") || - strings.Contains(host, ">") || - strings.Contains(host, "&") || - strings.Contains(host, "!") || - strings.Contains(host, "+") || - strings.Contains(host, "`") || - strings.Contains(host, ",") || - strings.Contains(host, "}") || - strings.Contains(host, "{") || - strings.Contains(host, "$") || - strings.Contains(host, "\"") || - strings.Contains(host, ":") || - strings.Contains(host, ";") { + + if strings.ContainsAny(host, "%[]=':*()<>!&+,}{}$\";`") { return false } @@ -628,12 +610,14 @@ func validateHost(host string) bool { return true } -// final verification of domain +// IsValidDomain - final verification of domain func IsValidDomain(domain string) bool { // Regular expression to match valid domain characters and rules // This regex does not cover all possible TLDs and might need modification for specific cases - re := regexp.MustCompile(`^(?i)([a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}$`) - return re.MatchString(domain) + // re := regexp.MustCompile(`^(?i)([a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}$`) + + // moving regex to global variable to make it faster - saved around 10% of time + return isValidDomainRegex.MatchString(domain) } // buildURLRecord - build url record from source url, check domain, path, query, etc. @@ -651,13 +635,8 @@ func buildURLRecord(sourceURL string, urlRecord *URLRecord) bool { return false } - // ignore path with \n - if strings.Contains(parsedURL.Path, "\n") { - return false - } - - // ignore path with | char - if strings.Contains(parsedURL.Path, "|") { + // ignore path with \n and | char + if strings.ContainsAny(parsedURL.Path, "\n|") { return false }