Blank identifier discards the error from blake2b.New256; this is safe here because New256 only returns an error for an invalid key length, and the key passed is nil — confirm intent with an inline comment rather than removing the discard.
fileJob.Hash, _ = blake2b.New256(nil)
1// SPDX-License-Identifier: MIT23package processor45import (6 "bytes"7 "hash"8 "runtime/debug"9 "strings"10 "sync"11 "sync/atomic"1213 "golang.org/x/crypto/blake2b"14)1516// The below are used as identifiers for the code state machine17const (18 SBlank int64 = 119 SCode int64 = 220 SComment int64 = 321 SCommentCode int64 = 4 // Indicates comment after code22 SMulticomment int64 = 523 SMulticommentCode int64 = 6 // Indicates multi comment after code24 SMulticommentBlank int64 = 7 // Indicates multi comment ended with blank afterward25 SString int64 = 826 SDocString int64 = 927)2829// SheBang is a global constant for indicating a shebang file header30const SheBang string = "#!"3132// LineType what type of line are processing33type LineType int323435// These are not meant to be CAMEL_CASE but as it us used by an external project we cannot change it36const (37 LINE_BLANK LineType = iota38 LINE_CODE39 LINE_COMMENT40)4142// ByteOrderMarks are taken from https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding43// These indicate that we cannot count the file correctly so we can at least warn the user44var ByteOrderMarks = [][]byte{45 {254, 255}, // UTF-16 BE46 {255, 254}, // UTF-16 LE47 {0, 0, 254, 255}, // UTF-32 BE48 {255, 254, 0, 0}, // UTF-32 LE49 {43, 47, 118, 56}, // UTF-750 {43, 47, 118, 57}, // UTF-751 {43, 47, 118, 43}, // UTF-752 {43, 47, 118, 47}, // UTF-753 {43, 47, 118, 56, 45}, // UTF-754 {247, 100, 76}, // UTF-155 {221, 115, 102, 115}, // UTF-EBCDIC56 {14, 254, 255}, // SCSU57 {251, 238, 40}, // BOCU-158 {132, 49, 149, 51}, // GB-1803059}6061var duplicates = CheckDuplicates{62 hashes: make(map[int64][][]byte),63}6465func checkForMatchSingle(currentByte byte, index int, endPoint int, matches []byte, fileJob *FileJob) bool {66 potentialMatch := true67 if currentByte == matches[0] {68 for j := 0; j < len(matches); j++ {69 if index+j >= endPoint || matches[j] != fileJob.Content[index+j] {70 potentialMatch = false71 break72 }73 }7475 if 
potentialMatch {76 return true77 }78 }7980 return false81}8283func isWhitespace(currentByte byte) bool {84 if currentByte != ' ' && currentByte != '\t' && currentByte != '\n' && currentByte != '\r' {85 return false86 }8788 return true89}9091func isIdentifierContinue(b byte) bool {92 return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9') || b == '_'93}9495func hasNonWhitespaceBefore(content []byte, index int) bool {96 for i := index - 1; i >= 0; i-- {97 if !isWhitespace(content[i]) {98 return true99 }100 }101102 return false103}104105func nextNonWhitespaceIndex(content []byte, index int) int {106 for index < len(content) && isWhitespace(content[index]) {107 index++108 }109110 return index111}112113func hasPostfixExclude(content []byte, index, offsetJump int, excludes [][]byte) bool {114 token := content[index : index+offsetJump]115 for _, exclude := range excludes {116 if len(exclude) < offsetJump || !bytes.Equal(token, exclude[:offsetJump]) {117 continue118 }119120 remaining := exclude[offsetJump:]121 if len(remaining) == 0 {122 return true123 }124125 next := nextNonWhitespaceIndex(content, index+offsetJump)126 if next+len(remaining) > len(content) || !bytes.Equal(content[next:next+len(remaining)], remaining) {127 continue128 }129130 afterExclude := next + len(remaining)131 if isIdentifierContinue(remaining[len(remaining)-1]) {132 return afterExclude == len(content) || !isIdentifierContinue(content[afterExclude])133 }134135 return true136 }137138 return false139}140141func countComplexityPostfix(fileJob *FileJob, index, offsetJump int, postfixExcludes [][]byte) {142 if index == 0 {143 return144 }145146 content := fileJob.Content147 if isWhitespace(content[index-1]) && !hasNonWhitespaceBefore(content, index-1) {148 return149 }150151 if len(postfixExcludes) > 0 && hasPostfixExclude(content, index, offsetJump, postfixExcludes) {152 return153 }154155 fileJob.Complexity++156 fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] = 
fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] + 1157}158159// Check if this file is binary by checking for nul byte and if so bail out160// this is how GNU Grep, git and ripgrep check for binary files161func isBinary(index int, currentByte byte) bool {162 return index < 10000 && !DisableCheckBinary && currentByte == 0163}164165func shouldProcess(currentByte, processBytesMask byte) bool {166 return currentByte&processBytesMask == currentByte167}168169func stateToByteType(state int64) byte {170 switch state {171 case SCode:172 return ByteTypeCode173 case SString:174 return ByteTypeString175 case SComment, SCommentCode, SMulticomment, SMulticommentCode, SMulticommentBlank, SDocString:176 return ByteTypeComment177 default: // SBlank178 return ByteTypeBlank179 }180}181182func resetState(currentState int64) int64 {183 switch currentState {184 case SMulticomment, SMulticommentCode:185 currentState = SMulticomment186 case SString:187 currentState = SString188 default:189 currentState = SBlank190 }191192 return currentState193}194195func stringState(fileJob *FileJob, index int, endPoint int, endString []byte, currentState int64, ignoreEscape bool) (int, int64) {196 // It's not possible to enter this state without checking at least 1 byte so it is safe to check -1 here197 // without checking if it is out of bounds first198 for i := index; i < endPoint; i++ {199 index = i200201 if fileJob.ContentByteType != nil {202 fileJob.ContentByteType[i] = ByteTypeString203 }204205 // If we hit a newline, return because we want to count the stats but keep206 // the current state so we end up back in this loop when the outer207 // one calls again208 if fileJob.Content[i] == '\n' {209 return i, currentState210 }211212 is_escaped := false213 // if there is an escape symbol before us, investigate214 if fileJob.Content[i-1] == '\\' {215 num_escapes := 0216 for j := i - 1; j > 0; j-- {217 if fileJob.Content[j] != '\\' {218 break219 }220 num_escapes++221 }222223 // if number of escapes 
is even, all escapes are themselves escaped224 // otherwise the last escape does escape current string terminator225 if num_escapes%2 != 0 {226 is_escaped = true227 }228 }229230 // If we are in a literal string we want to ignore escapes OR we aren't checking for special ones231 if ignoreEscape || !is_escaped {232 if checkForMatchSingle(fileJob.Content[i], index, endPoint, endString, fileJob) {233 return i, SCode234 }235 }236 }237238 return index, currentState239}240241// This is a special state check pretty much only ever used by Python codebases242// but potentially it could be expanded to deal with other types243func docStringState(fileJob *FileJob, index int, endPoint int, endString []byte, currentState int64) (int, int64) {244 // It's not possible to enter this state without checking at least 1 byte so it is safe to check -1 here245 // without checking if it is out of bounds first246 for i := index; i < endPoint; i++ {247 index = i248249 if fileJob.ContentByteType != nil {250 fileJob.ContentByteType[i] = ByteTypeComment251 }252253 if fileJob.Content[i] == '\n' {254 return i, currentState255 }256257 if fileJob.Content[i-1] != '\\' {258 if checkForMatchSingle(fileJob.Content[i], index, endPoint, endString, fileJob) {259 // So we have hit end of docstring at this point in which case check if only whitespace characters till the next260 // newline and if so we change to a comment otherwise to code261 // need to start the loop after ending definition of docstring, therefore adding the length of the string to262 // the index263 for j := index + len(endString); j <= endPoint; j++ {264 if fileJob.Content[j] == '\n' {265 printDebug("Found newline so docstring is comment")266 return i, SComment267 }268269 if !isWhitespace(fileJob.Content[j]) {270 printDebugF("Found something not whitespace so is code: %s", string(fileJob.Content[j]))271 return i, SCode272 }273 }274275 return i, SCode276 }277 }278 }279280 return index, currentState281}282283func codeState(284 fileJob 
*FileJob,285 index int,286 endPoint int,287 currentState int64,288 endString []byte,289 endComments [][]byte,290 langFeatures LanguageFeature,291 digest *hash.Hash,292) (int, int64, []byte, [][]byte, bool) {293 // Hacky fix to https://github.com/boyter/scc/issues/181294 if endPoint > len(fileJob.Content) {295 endPoint--296 }297298 for i := index; i < endPoint; i++ {299 curByte := fileJob.Content[i]300 index = i301302 if fileJob.ContentByteType != nil {303 fileJob.ContentByteType[i] = ByteTypeCode304 }305306 if curByte == '\n' {307 return i, currentState, endString, endComments, false308 }309310 if isBinary(i, curByte) {311 fileJob.Binary = true312 return i, currentState, endString, endComments, false313 }314315 if shouldProcess(curByte, langFeatures.ProcessMask) {316 if Duplicates {317 // Technically this is wrong because we skip bytes, so this is not a true318 // hash of the file contents, but for duplicate files it shouldn't matter319 // as both will skip the same way320 digestible := []byte{fileJob.Content[index]}321 (*digest).Write(digestible)322 }323324 switch tokenType, offsetJump, endString := langFeatures.Tokens.Match(fileJob.Content[i:]); tokenType {325 case TString:326 // If we are in string state then check what sort of string so we know if docstring OR ignoreescape string327 i, ignoreEscape := verifyIgnoreEscape(langFeatures, fileJob, index)328329 // It is safe to -1 here as to enter the code state we need to have330 // transitioned from blank to here hence i should always be >= 1331 // This check is to ensure we aren't in a character declaration332 // TODO this should use language features333 if fileJob.Content[i-1] != '\\' {334 currentState = SString335 }336337 return i, currentState, endString, endComments, ignoreEscape338339 case TSlcomment:340 currentState = SCommentCode341 return i, currentState, endString, endComments, false342343 case TMlcomment:344 if langFeatures.Nested || len(endComments) == 0 {345 endComments = append(endComments, 
endString)346 currentState = SMulticommentCode347 i += offsetJump - 1348349 return i, currentState, endString, endComments, false350 }351352 case TComplexity:353 if index == 0 || isWhitespace(fileJob.Content[index-1]) {354 fileJob.Complexity++355 fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] = fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] + 1356 }357358 case TComplexityPostfix:359 countComplexityPostfix(fileJob, index, offsetJump, langFeatures.PostfixExcludes)360 }361 }362 }363364 return index, currentState, endString, endComments, false365}366367func commentState(fileJob *FileJob, index int, endPoint int, currentState int64, endComments [][]byte, endString []byte, langFeatures LanguageFeature) (int, int64, []byte, [][]byte) {368 for i := index; i < endPoint; i++ {369 curByte := fileJob.Content[i]370 index = i371372 if fileJob.ContentByteType != nil {373 fileJob.ContentByteType[i] = ByteTypeComment374 }375376 if curByte == '\n' {377 return i, currentState, endString, endComments378 }379380 if checkForMatchSingle(curByte, index, endPoint, endComments[len(endComments)-1], fileJob) {381 // set offset jump here382 offsetJump := len(endComments[len(endComments)-1])383 endComments = endComments[:len(endComments)-1]384385 if len(endComments) == 0 {386 // If we started as multiline code switch back to code so we count correctly387 // IE i := 1 /* for the lols */388 // TODO is that required? 
Might still be required to count correctly389 if currentState == SMulticommentCode {390 currentState = SCode // TODO pointless to change here, just set S_MULTICOMMENT_BLANK391 } else {392 currentState = SMulticommentBlank393 }394 }395396 i += offsetJump - 1397 return i, currentState, endString, endComments398 }399 // Check if we are entering another multiline comment400 // This should come below check for match single as it speeds up processing401 if langFeatures.Nested || len(endComments) == 0 {402 if ok, offsetJump, endString := langFeatures.MultiLineComments.Match(fileJob.Content[i:]); ok != 0 {403 endComments = append(endComments, endString)404 i += offsetJump - 1405406 return i, currentState, endString, endComments407 }408 }409 }410411 return index, currentState, endString, endComments412}413414func blankState(415 fileJob *FileJob,416 index int,417 currentState int64,418 endComments [][]byte,419 endString []byte,420 langFeatures LanguageFeature,421) (int, int64, []byte, [][]byte, bool) {422 switch tokenType, offsetJump, endString := langFeatures.Tokens.Match(fileJob.Content[index:]); tokenType {423 case TMlcomment:424 if langFeatures.Nested || len(endComments) == 0 {425 endComments = append(endComments, endString)426 currentState = SMulticomment427 index += offsetJump - 1428 if fileJob.ContentByteType != nil {429 fileJob.ContentByteType[index] = ByteTypeComment430 }431 return index, currentState, endString, endComments, false432 }433434 case TSlcomment:435 currentState = SComment436 if fileJob.ContentByteType != nil {437 fileJob.ContentByteType[index] = ByteTypeComment438 }439 return index, currentState, endString, endComments, false440441 case TString:442 index, ignoreEscape := verifyIgnoreEscape(langFeatures, fileJob, index)443444 for _, v := range langFeatures.Quotes {445 if v.End == string(endString) && v.DocString {446 currentState = SDocString447 if fileJob.ContentByteType != nil {448 fileJob.ContentByteType[index] = ByteTypeComment449 }450 return index, 
currentState, endString, endComments, ignoreEscape451 }452 }453 currentState = SString454 if fileJob.ContentByteType != nil {455 fileJob.ContentByteType[index] = ByteTypeString456 }457 return index, currentState, endString, endComments, ignoreEscape458459 case TComplexity:460 currentState = SCode461 if fileJob.ContentByteType != nil {462 fileJob.ContentByteType[index] = ByteTypeCode463 }464 if index == 0 || isWhitespace(fileJob.Content[index-1]) {465 fileJob.Complexity++466 fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] = fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] + 1467 }468469 case TComplexityPostfix:470 currentState = SCode471 if fileJob.ContentByteType != nil {472 fileJob.ContentByteType[index] = ByteTypeCode473 }474 countComplexityPostfix(fileJob, index, offsetJump, langFeatures.PostfixExcludes)475476 default:477 currentState = SCode478 if fileJob.ContentByteType != nil {479 fileJob.ContentByteType[index] = ByteTypeCode480 }481 }482483 return index, currentState, endString, endComments, false484}485486// Some languages such as C# have quoted strings like @"\" where no escape character is required487// this checks if there is one so we can cater for these cases488func verifyIgnoreEscape(langFeatures LanguageFeature, fileJob *FileJob, index int) (int, bool) {489 ignoreEscape := false490491 // loop over the string states and if we have the special flag match, and if so we need to ensure we can handle them492 for i := 0; i < len(langFeatures.Quotes); i++ {493 if langFeatures.Quotes[i].DocString || langFeatures.Quotes[i].IgnoreEscape {494 // If so we need to check if where we are falls into these conditions495 isMatch := true496 for j := 0; j < len(langFeatures.Quotes[i].Start); j++ {497 if len(fileJob.Content) <= index+j || fileJob.Content[index+j] != langFeatures.Quotes[i].Start[j] {498 isMatch = false499 break500 }501 }502503 // If we have a match then jump ahead enough so we don't pick it up again for cases like @"504 if isMatch {505 
ignoreEscape = true506 index = index + len(langFeatures.Quotes[i].Start)507 }508 }509 }510511 return index, ignoreEscape512}513514// CountStats will process the fileJob515// If the file contains anything even just a newline its line count should be >= 1.516// If the file has a size of 0 its line count should be 0.517// Newlines belong to the line they started on so a file of \n means only 1 line518// This is the 'hot' path for the application and needs to be as fast as possible519func CountStats(fileJob *FileJob) {520 // For determining duplicates we need the below. The reason for creating521 // the byte array here is to avoid GC pressure. MD5 is in the standard library522 // and is fast enough to not warrant murmur3 hashing. No need to be523 // crypto secure here either so no need to eat the performance cost of a better524 // hash method525 if Duplicates {526 fileJob.Hash, _ = blake2b.New256(nil)527 }528529 // If the file has a length of 0 it is empty then we say it has no lines530 if fileJob.Bytes == 0 {531 fileJob.Lines = 0532 return533 }534535 LanguageFeaturesMutex.Lock()536 langFeatures := LanguageFeatures[fileJob.Language]537 LanguageFeaturesMutex.Unlock()538539 if langFeatures.Complexity == nil {540 langFeatures.Complexity = &Trie{}541 }542 if langFeatures.SingleLineComments == nil {543 langFeatures.SingleLineComments = &Trie{}544 }545 if langFeatures.MultiLineComments == nil {546 langFeatures.MultiLineComments = &Trie{}547 }548 if langFeatures.Strings == nil {549 langFeatures.Strings = &Trie{}550 }551 if langFeatures.Tokens == nil {552 langFeatures.Tokens = &Trie{}553 }554555 endPoint := int(fileJob.Bytes - 1)556 currentState := SBlank557 endComments := [][]byte{}558 endString := []byte{}559560 // TODO needs to be set via langFeatures.Quotes[0].IgnoreEscape for the matching feature561 ignoreEscape := false562 fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)563564 if fileJob.ClassifyContent {565 fileJob.ContentByteType = make([]byte, 
fileJob.Bytes)566 }567568 for index := checkBomSkip(fileJob); index < int(fileJob.Bytes); index++ {569 if fileJob.ContentByteType != nil {570 fileJob.ContentByteType[index] = stateToByteType(currentState)571 }572573 // Based on our current state determine if the state should change by checking574 // what the character is. The below is very CPU bound so need to be careful if575 // changing anything in here and profile/measure afterwards!576 // NB that the order of the if statements matters and has been set to what in benchmarks is most efficient577 if !isWhitespace(fileJob.Content[index]) {578579 switch currentState {580 case SCode:581 index, currentState, endString, endComments, ignoreEscape = codeState(582 fileJob,583 index,584 endPoint,585 currentState,586 endString,587 endComments,588 langFeatures,589 &fileJob.Hash,590 )591 case SString:592 index, currentState = stringState(fileJob, index, endPoint, endString, currentState, ignoreEscape)593 case SDocString:594 // For a docstring we can either move into blank in which case we count it as a docstring595 // or back into code in which case it should be counted as code596 index, currentState = docStringState(fileJob, index, endPoint, endString, currentState)597 case SMulticomment, SMulticommentCode:598 index, currentState, endString, endComments = commentState(599 fileJob,600 index,601 endPoint,602 currentState,603 endComments,604 endString,605 langFeatures,606 )607 case SBlank, SMulticommentBlank:608 // From blank we can move into comment, move into a multiline comment609 // or move into code but we can only do one.610 index, currentState, endString, endComments, ignoreEscape = blankState(611 fileJob,612 index,613 currentState,614 endComments,615 endString,616 langFeatures,617 )618 }619 }620621 // We shouldn't normally need this, but unclosed strings or comments622 // might leave the index past the end of the file when we reach this623 // point.624 if index >= len(fileJob.Content) {625 return626 }627628 // Only 
check the first 10000 characters for null bytes indicating a binary file629 // and if we find it then we return otherwise carry on and ignore binary markers630 if index < 10000 && fileJob.Binary {631 return632 }633634 // This means the end of processing the line so calculate the stats according to what state635 // we are currently in636 if fileJob.Content[index] == '\n' || index >= endPoint {637 fileJob.Lines++638 fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)639640 if NoLarge && fileJob.Lines >= LargeLineCount {641 // Save memory by unsetting the content as we no longer require it642 fileJob.Content = nil643 return644 }645646 switch currentState {647 case SCode, SString, SCommentCode, SMulticommentCode:648 fileJob.Code++649 currentState = resetState(currentState)650 if fileJob.Callback != nil {651 if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_CODE) {652 return653 }654 }655 if Trace {656 // Don't remove the outside if-statements, for performance657 printTraceF("%s line %d ended with state: %d: counted as code", fileJob.Location, fileJob.Lines, currentState)658 }659 case SComment, SMulticomment, SMulticommentBlank:660 fileJob.Comment++661 currentState = resetState(currentState)662 if fileJob.Callback != nil {663 if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_COMMENT) {664 return665 }666 }667 if Trace {668 // Same as above669 printTraceF("%s line %d ended with state: %d: counted as comment", fileJob.Location, fileJob.Lines, currentState)670 }671 case SBlank:672 fileJob.Blank++673 if fileJob.Callback != nil {674 if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_BLANK) {675 return676 }677 }678 if Trace {679 // Same as above680 printTraceF("%s line %d ended with state: %d: counted as blank", fileJob.Location, fileJob.Lines, currentState)681 }682 case SDocString:683 fileJob.Comment++684 if fileJob.Callback != nil {685 if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_COMMENT) {686 return687 }688 }689 if 
Trace {690 // Same as above691 printTraceF("%s line %d ended with state: %d: counted as comment", fileJob.Location, fileJob.Lines, currentState)692 }693 }694 }695 }696697 if UlocMode {698 uloc := map[string]struct{}{}699 for l := range strings.SplitSeq(strings.TrimRight(string(fileJob.Content), "\n"), "\n") {700 uloc[l] = struct{}{}701 }702 fileJob.Uloc = len(uloc)703 }704705 if MaxMean {706 for l := range strings.SplitSeq(strings.TrimRight(string(fileJob.Content), "\n"), "\n") {707 fileJob.LineLength = append(fileJob.LineLength, len(l))708 }709 }710711 isGenerated := false712713 if Generated {714 headLen := min(1000, len(fileJob.Content))715 head := bytes.ToLower(fileJob.Content[0:headLen])716 for _, marker := range GeneratedMarkers {717 if bytes.Contains(head, bytes.ToLower([]byte(marker))) {718 fileJob.Generated = true719 fileJob.Language = fileJob.Language + " (gen)"720 isGenerated = true721 printWarnF("%s identified as isGenerated with heading comment", fileJob.Filename)722 break723 }724 }725 }726727 // check if 0 as well to avoid divide by zero https://github.com/boyter/scc/issues/223728 if !isGenerated && Minified && fileJob.Lines != 0 {729 avgLineByteCount := len(fileJob.Content) / int(fileJob.Lines)730 minifiedGeneratedCheck(avgLineByteCount, fileJob)731 }732733 fileJob.ComplexityLine = fileJob.ComplexityLine[:fileJob.Lines]734}735736func minifiedGeneratedCheck(avgLineByteCount int, fileJob *FileJob) {737 if avgLineByteCount >= MinifiedGeneratedLineByteLength {738 fileJob.Minified = true739 fileJob.Language = fileJob.Language + " (min)"740 printWarnF("%s identified as minified/generated with average line byte length of %d >= %d", fileJob.Filename, avgLineByteCount, MinifiedGeneratedLineByteLength)741 } else {742 printDebugF("%s not identified as minified/generated with average line byte length of %d < %d", fileJob.Filename, avgLineByteCount, MinifiedGeneratedLineByteLength)743 }744}745746// Check if we have any Byte Order Marks (BOM) in front of the 
file747func checkBomSkip(fileJob *FileJob) int {748 // UTF-8 BOM which if detected we should skip the BOM as we can then count correctly749 // []byte is UTF-8 BOM taken from https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding750 if bytes.HasPrefix(fileJob.Content, []byte{239, 187, 191}) {751 printWarnF("UTF-8 BOM found for file %s skipping 3 bytes", fileJob.Filename)752 return 3753 }754755 // If we have one of the other BOM then we might not be able to count correctly so if verbose let the user know756 if Verbose {757 for _, v := range ByteOrderMarks {758 if bytes.HasPrefix(fileJob.Content, v) {759 printWarnF("BOM found for file %s indicating it is not ASCII/UTF-8 and may be counted incorrectly or ignored as a binary file", fileJob.Filename)760 }761 }762 }763764 return 0765}766767// Reads and processes files from input chan in parallel, and sends results to768// output chan769func (ctx processorContext) fileProcessorWorker(input chan *FileJob, output chan *FileJob) {770 var startTime int64771 var fileCount int64772 var gcEnabled int64773 var wg sync.WaitGroup774775 for i := 0; i < FileProcessJobWorkers; i++ {776 wg.Add(1)777 go func() {778 reader := NewFileReader()779780 for job := range input {781 atomic.CompareAndSwapInt64(&startTime, 0, makeTimestampMilli())782783 loc := job.Location784 if job.Symlocation != "" {785 loc = job.Symlocation786 }787788 fileStartTime := makeTimestampNano()789 content, err := reader.ReadFile(loc, int(job.Bytes))790 atomic.AddInt64(&fileCount, 1)791792 if atomic.LoadInt64(&gcEnabled) == 0 && atomic.LoadInt64(&fileCount) >= int64(GcFileCount) {793 debug.SetGCPercent(gcPercent)794 atomic.AddInt64(&gcEnabled, 1)795 printWarn("read file limit exceeded GC re-enabled")796 }797798 printTraceF("nanoseconds read into memory: %s: %d", job.Location, makeTimestampNano()-fileStartTime)799800 if err == nil {801 job.Content = content802 if ctx.processFile(job) {803 output <- job804 }805 } else {806 printWarnF("error reading: %s 
%s", job.Location, err)807 }808 }809810 wg.Done()811 }()812 }813814 go func() {815 wg.Wait()816 close(output)817818 printDebugF("milliseconds reading files into memory: %d", makeTimestampMilli()-startTime)819 }()820}821822// Process a single file823// File must have been read to job.Content already824func (ctx processorContext) processFile(job *FileJob) bool {825 fileStartTime := makeTimestampNano()826827 contents := job.Content828829 // Needs to always run to ensure the language is set830 job.Language = DetermineLanguage(job.Filename, job.Language, job.PossibleLanguages, job.Content)831832 remapped := false833 if len(ctx.remap.all) != 0 {834 ctx.hardRemapLanguage(job)835 }836837 // If the type is #! we should check to see if we can identify838 if job.Language == SheBang {839 if len(ctx.remap.unknown) != 0 {840 remapped = ctx.unknownRemapLanguage(job)841 }842843 // if we didn't remap we then want to see if it's a #! map844 if !remapped {845 cutoff := min(200, len(contents))846847 lang, err := DetectSheBang(string(contents[:cutoff]))848 if err != nil {849 printWarnF("unable to determine #! language for %s", job.Location)850 return false851 }852853 printWarnF("detected #! 
%s for %s", lang, job.Location)854 job.Language = lang855 LoadLanguageFeature(lang)856 }857 }858859 CountStats(job)860861 if Duplicates {862 duplicates.mux.Lock()863 jobHash := job.Hash.Sum(nil)864 if duplicates.Check(job.Bytes, jobHash) {865 printWarnF("skipping duplicate file: %s", job.Location)866 duplicates.mux.Unlock()867 return false868 }869870 duplicates.Add(job.Bytes, jobHash)871 duplicates.mux.Unlock()872 }873874 if IgnoreMinified && job.Minified {875 printWarnF("skipping minified file: %s", job.Location)876 return false877 }878879 if IgnoreGenerated && job.Generated {880 printWarnF("skipping generated file: %s", job.Location)881 return false882 }883884 if NoLarge && job.Lines >= LargeLineCount {885 printWarnF("skipping large file due to line length: %s", job.Location)886 return false887 }888889 printTraceF("nanoseconds process: %s: %d", job.Location, makeTimestampNano()-fileStartTime)890891 if job.Binary {892 printWarnF("skipping file identified as binary: %s", job.Location)893 return false894 }895896 // This needs to be at the end so we can ensure duplicate detection et.al run first897 // avoiding inflating the counts898 if UlocMode {899 ulocMutex.Lock()900901 for l := range strings.SplitSeq(strings.TrimRight(string(job.Content), "\n"), "\n") {902 ulocGlobalCount[l] = struct{}{}903904 _, ok := ulocLanguageCount[job.Language]905 if !ok {906 ulocLanguageCount[job.Language] = map[string]struct{}{}907 }908 ulocLanguageCount[job.Language][l] = struct{}{}909 }910 ulocMutex.Unlock()911 }912913 return true914}915916func (ctx processorContext) hardRemapLanguage(job *FileJob) bool {917 remapped := false918 cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look919920 for _, rule := range ctx.remap.all {921 if bytes.Contains(job.Content[:cutoff], rule.pattern) {922 job.Language = rule.language923 remapped = true924 printWarnF("hard remapping: %s to %s", job.Location, job.Language)925 }926 }927928 return remapped929}930931func (ctx 
processorContext) unknownRemapLanguage(job *FileJob) bool {932 remapped := false933 cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look934935 for _, rule := range ctx.remap.unknown {936 if bytes.Contains(job.Content[:cutoff], rule.pattern) {937 job.Language = rule.language938 remapped = true939 printWarnF("unknown remapping: %s to %s", job.Location, job.Language)940 }941 }942943 return remapped944}
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.