diff --git a/cmd/importer/main.go b/cmd/importer/main.go index 636566b..7d78e2f 100644 --- a/cmd/importer/main.go +++ b/cmd/importer/main.go @@ -3,6 +3,7 @@ package main import ( "bufio" "fmt" + "github.com/kris-dev-hub/globallinks/pkg/healthcheck" "log" "net/http" "os" @@ -14,8 +15,6 @@ import ( "sync" "time" - "github.com/kris-dev-hub/globallinks/pkg/healthcheck" - "github.com/klauspost/compress/gzip" "github.com/kris-dev-hub/globallinks/pkg/commoncrawl" @@ -204,11 +203,6 @@ func importSegment(segment commoncrawl.WatSegment, dataDir commoncrawl.DataDir, for _, watFile := range segment.WatFiles { - // sleep between WAT files to avoid common crawl transfer limitation - if sleepBetweenWat > 0 { - time.Sleep(time.Duration(sleepBetweenWat) * time.Second) - } - // ignore imported files if watFile.Imported != nil { continue @@ -254,6 +248,11 @@ func importSegment(segment commoncrawl.WatSegment, dataDir commoncrawl.DataDir, panic(fmt.Sprintf("Failed to create file: %v", err)) } + // sleep between WAT files to avoid common crawl transfer limitation + if sleepBetweenWat > 0 { + time.Sleep(time.Duration(sleepBetweenWat) * time.Second) + } + wg.Add(1) // Before starting the goroutine, we insert an empty struct into the guard channel. // If the channel is already full (meaning we have 'maxGoroutines' goroutines running),