Blank identifier discarding results; verify intentional ignoring of return values
fileJob.Hash, _ = blake2b.New256(nil)
1// SPDX-License-Identifier: MIT23package processor45import (6 "bytes"7 "hash"8 "runtime/debug"9 "strings"10 "sync"11 "sync/atomic"1213 "golang.org/x/crypto/blake2b"14)1516// The below are used as identifiers for the code state machine17const (18 SBlank int64 = 119 SCode int64 = 220 SComment int64 = 321 SCommentCode int64 = 4 // Indicates comment after code22 SMulticomment int64 = 523 SMulticommentCode int64 = 6 // Indicates multi comment after code24 SMulticommentBlank int64 = 7 // Indicates multi comment ended with blank afterward25 SString int64 = 826 SDocString int64 = 927)2829// SheBang is a global constant for indicating a shebang file header30const SheBang string = "#!"3132// LineType what type of line are processing33type LineType int323435// These are not meant to be CAMEL_CASE but as it us used by an external project we cannot change it36const (37 LINE_BLANK LineType = iota38 LINE_CODE39 LINE_COMMENT40)4142// ByteOrderMarks are taken from https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding43// These indicate that we cannot count the file correctly so we can at least warn the user44var ByteOrderMarks = [][]byte{45 {254, 255}, // UTF-16 BE46 {255, 254}, // UTF-16 LE47 {0, 0, 254, 255}, // UTF-32 BE48 {255, 254, 0, 0}, // UTF-32 LE49 {43, 47, 118, 56}, // UTF-750 {43, 47, 118, 57}, // UTF-751 {43, 47, 118, 43}, // UTF-752 {43, 47, 118, 47}, // UTF-753 {43, 47, 118, 56, 45}, // UTF-754 {247, 100, 76}, // UTF-155 {221, 115, 102, 115}, // UTF-EBCDIC56 {14, 254, 255}, // SCSU57 {251, 238, 40}, // BOCU-158 {132, 49, 149, 51}, // GB-1803059}6061var duplicates = CheckDuplicates{62 hashes: make(map[int64][][]byte),63}6465func checkForMatchSingle(currentByte byte, index int, endPoint int, matches []byte, fileJob *FileJob) bool {66 potentialMatch := true67 if currentByte == matches[0] {68 for j := range matches {69 if index+j >= endPoint || matches[j] != fileJob.Content[index+j] {70 potentialMatch = false71 break72 }73 }7475 if potentialMatch {76 return true77 }78 }7980 return false81}8283func isWhitespace(currentByte byte) bool {84 if currentByte != ' ' && currentByte != '\t' && currentByte != '\n' && currentByte != '\r' {85 return false86 }8788 return true89}9091func isIdentifierContinue(b byte) bool {92 return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9') || b == '_'93}9495func hasNonWhitespaceBefore(content []byte, index int) bool {96 for i := index - 1; i >= 0; i-- {97 if !isWhitespace(content[i]) {98 return true99 }100 }101102 return false103}104105func nextNonWhitespaceIndex(content []byte, index int) int {106 for index < len(content) && isWhitespace(content[index]) {107 index++108 }109110 return index111}112113func hasPostfixExclude(content []byte, index, offsetJump int, excludes [][]byte) bool {114 token := content[index : index+offsetJump]115 for _, exclude := range excludes {116 if len(exclude) < offsetJump || !bytes.Equal(token, exclude[:offsetJump]) {117 continue118 }119120 remaining := exclude[offsetJump:]121 if len(remaining) == 0 {122 return true123 }124125 next := nextNonWhitespaceIndex(content, index+offsetJump)126 if next+len(remaining) > len(content) || !bytes.Equal(content[next:next+len(remaining)], remaining) {127 continue128 }129130 afterExclude := next + len(remaining)131 if isIdentifierContinue(remaining[len(remaining)-1]) {132 return afterExclude == len(content) || !isIdentifierContinue(content[afterExclude])133 }134135 return true136 }137138 return false139}140141func countComplexityPostfix(fileJob *FileJob, index, offsetJump int, postfixExcludes [][]byte) {142 if index == 0 {143 return144 }145146 content := fileJob.Content147 if isWhitespace(content[index-1]) && !hasNonWhitespaceBefore(content, index-1) {148 return149 }150151 if len(postfixExcludes) > 0 && hasPostfixExclude(content, index, offsetJump, postfixExcludes) {152 return153 }154155 fileJob.Complexity++156 fileJob.bumpComplexityLine()157}158159// bumpComplexityLine adds one complexity tick to the line currently being160// counted. No-op when TrackComplexityLines is off — ComplexityLine is left161// empty by CountStats in that case, so there is no slot to bump.162func (fileJob *FileJob) bumpComplexityLine() {163 if n := len(fileJob.ComplexityLine); n > 0 {164 fileJob.ComplexityLine[n-1]++165 }166}167168// Check if this file is binary by checking for nul byte and if so bail out169// this is how GNU Grep, git and ripgrep check for binary files170func isBinary(index int, currentByte byte) bool {171 return index < 10000 && !DisableCheckBinary && currentByte == 0172}173174func shouldProcess(currentByte, processBytesMask byte) bool {175 return currentByte&processBytesMask == currentByte176}177178func stateToByteType(state int64) byte {179 switch state {180 case SCode:181 return ByteTypeCode182 case SString:183 return ByteTypeString184 case SComment, SCommentCode, SMulticomment, SMulticommentCode, SMulticommentBlank, SDocString:185 return ByteTypeComment186 default: // SBlank187 return ByteTypeBlank188 }189}190191func resetState(currentState int64) int64 {192 switch currentState {193 case SMulticomment, SMulticommentCode:194 currentState = SMulticomment195 case SString:196 currentState = SString197 default:198 currentState = SBlank199 }200201 return currentState202}203204func stringState(fileJob *FileJob, index int, endPoint int, endString []byte, currentState int64, ignoreEscape bool) (int, int64) {205 // It's not possible to enter this state without checking at least 1 byte so it is safe to check -1 here206 // without checking if it is out of bounds first207 for i := index; i < endPoint; i++ {208 index = i209210 if fileJob.ContentByteType != nil {211 fileJob.ContentByteType[i] = ByteTypeString212 }213214 // If we hit a newline, return because we want to count the stats but keep215 // the current state so we end up back in this loop when the outer216 // one calls again217 if fileJob.Content[i] == '\n' {218 return i, currentState219 }220221 is_escaped := false222 // if there is an escape symbol before us, investigate223 if fileJob.Content[i-1] == '\\' {224 num_escapes := 0225 for j := i - 1; j > 0; j-- {226 if fileJob.Content[j] != '\\' {227 break228 }229 num_escapes++230 }231232 // if number of escapes is even, all escapes are themselves escaped233 // otherwise the last escape does escape current string terminator234 if num_escapes%2 != 0 {235 is_escaped = true236 }237 }238239 // If we are in a literal string we want to ignore escapes OR we aren't checking for special ones240 if ignoreEscape || !is_escaped {241 if checkForMatchSingle(fileJob.Content[i], index, endPoint, endString, fileJob) {242 return i, SCode243 }244 }245 }246247 return index, currentState248}249250// This is a special state check pretty much only ever used by Python codebases251// but potentially it could be expanded to deal with other types252func docStringState(fileJob *FileJob, index int, endPoint int, endString []byte, currentState int64) (int, int64) {253 // It's not possible to enter this state without checking at least 1 byte so it is safe to check -1 here254 // without checking if it is out of bounds first255 for i := index; i < endPoint; i++ {256 index = i257258 if fileJob.ContentByteType != nil {259 fileJob.ContentByteType[i] = ByteTypeComment260 }261262 if fileJob.Content[i] == '\n' {263 return i, currentState264 }265266 if fileJob.Content[i-1] != '\\' {267 if checkForMatchSingle(fileJob.Content[i], index, endPoint, endString, fileJob) {268 // So we have hit end of docstring at this point in which case check if only whitespace characters till the next269 // newline and if so we change to a comment otherwise to code270 // need to start the loop after ending definition of docstring, therefore adding the length of the string to271 // the index272 for j := index + len(endString); j <= endPoint; j++ {273 if fileJob.Content[j] == '\n' {274 printDebug("Found newline so docstring is comment")275 return i, SComment276 }277278 if !isWhitespace(fileJob.Content[j]) {279 printDebugF("Found something not whitespace so is code: %s", string(fileJob.Content[j]))280 return i, SCode281 }282 }283284 return i, SCode285 }286 }287 }288289 return index, currentState290}291292func codeState(293 fileJob *FileJob,294 index int,295 endPoint int,296 currentState int64,297 endString []byte,298 endComments [][]byte,299 langFeatures LanguageFeature,300 digest *hash.Hash,301) (int, int64, []byte, [][]byte, bool) {302 // Hacky fix to https://github.com/boyter/scc/issues/181303 if endPoint > len(fileJob.Content) {304 endPoint--305 }306307 for i := index; i < endPoint; i++ {308 curByte := fileJob.Content[i]309 index = i310311 if fileJob.ContentByteType != nil {312 fileJob.ContentByteType[i] = ByteTypeCode313 }314315 if curByte == '\n' {316 return i, currentState, endString, endComments, false317 }318319 if isBinary(i, curByte) {320 fileJob.Binary = true321 return i, currentState, endString, endComments, false322 }323324 if shouldProcess(curByte, langFeatures.ProcessMask) {325 if Duplicates {326 // Technically this is wrong because we skip bytes, so this is not a true327 // hash of the file contents, but for duplicate files it shouldn't matter328 // as both will skip the same way329 digestible := []byte{fileJob.Content[index]}330 (*digest).Write(digestible)331 }332333 switch tokenType, offsetJump, endString := langFeatures.Tokens.Match(fileJob.Content[i:]); tokenType {334 case TString:335 // If we are in string state then check what sort of string so we know if docstring OR ignoreescape string336 i, ignoreEscape := verifyIgnoreEscape(langFeatures, fileJob, index)337338 // It is safe to -1 here as to enter the code state we need to have339 // transitioned from blank to here hence i should always be >= 1340 // This check is to ensure we aren't in a character declaration341 // TODO this should use language features342 if fileJob.Content[i-1] != '\\' {343 currentState = SString344 }345346 return i, currentState, endString, endComments, ignoreEscape347348 case TSlcomment:349 currentState = SCommentCode350 return i, currentState, endString, endComments, false351352 case TMlcomment:353 if langFeatures.Nested || len(endComments) == 0 {354 endComments = append(endComments, endString)355 currentState = SMulticommentCode356 i += offsetJump - 1357358 return i, currentState, endString, endComments, false359 }360361 case TComplexity:362 if index == 0 || isWhitespace(fileJob.Content[index-1]) {363 fileJob.Complexity++364 fileJob.bumpComplexityLine()365 }366367 case TComplexityPostfix:368 countComplexityPostfix(fileJob, index, offsetJump, langFeatures.PostfixExcludes)369 }370 }371 }372373 return index, currentState, endString, endComments, false374}375376func commentState(fileJob *FileJob, index int, endPoint int, currentState int64, endComments [][]byte, endString []byte, langFeatures LanguageFeature) (int, int64, []byte, [][]byte) {377 for i := index; i < endPoint; i++ {378 curByte := fileJob.Content[i]379 index = i380381 if fileJob.ContentByteType != nil {382 fileJob.ContentByteType[i] = ByteTypeComment383 }384385 if curByte == '\n' {386 return i, currentState, endString, endComments387 }388389 if checkForMatchSingle(curByte, index, endPoint, endComments[len(endComments)-1], fileJob) {390 // set offset jump here391 offsetJump := len(endComments[len(endComments)-1])392 endComments = endComments[:len(endComments)-1]393394 if len(endComments) == 0 {395 // If we started as multiline code switch back to code so we count correctly396 // IE i := 1 /* for the lols */397 // TODO is that required? Might still be required to count correctly398 if currentState == SMulticommentCode {399 currentState = SCode // TODO pointless to change here, just set S_MULTICOMMENT_BLANK400 } else {401 currentState = SMulticommentBlank402 }403 }404405 i += offsetJump - 1406 return i, currentState, endString, endComments407 }408 // Check if we are entering another multiline comment409 // This should come below check for match single as it speeds up processing410 if langFeatures.Nested || len(endComments) == 0 {411 if ok, offsetJump, endString := langFeatures.MultiLineComments.Match(fileJob.Content[i:]); ok != 0 {412 endComments = append(endComments, endString)413 i += offsetJump - 1414415 return i, currentState, endString, endComments416 }417 }418 }419420 return index, currentState, endString, endComments421}422423func blankState(424 fileJob *FileJob,425 index int,426 currentState int64,427 endComments [][]byte,428 endString []byte,429 langFeatures LanguageFeature,430) (int, int64, []byte, [][]byte, bool) {431 switch tokenType, offsetJump, endString := langFeatures.Tokens.Match(fileJob.Content[index:]); tokenType {432 case TMlcomment:433 if langFeatures.Nested || len(endComments) == 0 {434 endComments = append(endComments, endString)435 currentState = SMulticomment436 index += offsetJump - 1437 if fileJob.ContentByteType != nil {438 fileJob.ContentByteType[index] = ByteTypeComment439 }440 return index, currentState, endString, endComments, false441 }442443 case TSlcomment:444 currentState = SComment445 if fileJob.ContentByteType != nil {446 fileJob.ContentByteType[index] = ByteTypeComment447 }448 return index, currentState, endString, endComments, false449450 case TString:451 index, ignoreEscape := verifyIgnoreEscape(langFeatures, fileJob, index)452453 for _, v := range langFeatures.Quotes {454 if v.End == string(endString) && v.DocString {455 currentState = SDocString456 if fileJob.ContentByteType != nil {457 fileJob.ContentByteType[index] = ByteTypeComment458 }459 return index, currentState, endString, endComments, ignoreEscape460 }461 }462 currentState = SString463 if fileJob.ContentByteType != nil {464 fileJob.ContentByteType[index] = ByteTypeString465 }466 return index, currentState, endString, endComments, ignoreEscape467468 case TComplexity:469 currentState = SCode470 if fileJob.ContentByteType != nil {471 fileJob.ContentByteType[index] = ByteTypeCode472 }473 if index == 0 || isWhitespace(fileJob.Content[index-1]) {474 fileJob.Complexity++475 fileJob.bumpComplexityLine()476 }477478 case TComplexityPostfix:479 currentState = SCode480 if fileJob.ContentByteType != nil {481 fileJob.ContentByteType[index] = ByteTypeCode482 }483 countComplexityPostfix(fileJob, index, offsetJump, langFeatures.PostfixExcludes)484485 default:486 currentState = SCode487 if fileJob.ContentByteType != nil {488 fileJob.ContentByteType[index] = ByteTypeCode489 }490 }491492 return index, currentState, endString, endComments, false493}494495// Some languages such as C# have quoted strings like @"\" where no escape character is required496// this checks if there is one so we can cater for these cases497func verifyIgnoreEscape(langFeatures LanguageFeature, fileJob *FileJob, index int) (int, bool) {498 ignoreEscape := false499500 // loop over the string states and if we have the special flag match, and if so we need to ensure we can handle them501 for i := 0; i < len(langFeatures.Quotes); i++ {502 if langFeatures.Quotes[i].DocString || langFeatures.Quotes[i].IgnoreEscape {503 // If so we need to check if where we are falls into these conditions504 isMatch := true505 for j := 0; j < len(langFeatures.Quotes[i].Start); j++ {506 if len(fileJob.Content) <= index+j || fileJob.Content[index+j] != langFeatures.Quotes[i].Start[j] {507 isMatch = false508 break509 }510 }511512 // If we have a match then jump ahead enough so we don't pick it up again for cases like @"513 if isMatch {514 ignoreEscape = true515 index = index + len(langFeatures.Quotes[i].Start)516 }517 }518 }519520 return index, ignoreEscape521}522523// CountStats will process the fileJob524// If the file contains anything even just a newline its line count should be >= 1.525// If the file has a size of 0 its line count should be 0.526// Newlines belong to the line they started on so a file of \n means only 1 line527// This is the 'hot' path for the application and needs to be as fast as possible528func CountStats(fileJob *FileJob) {529 // For determining duplicates we need the below. The reason for creating530 // the byte array here is to avoid GC pressure. MD5 is in the standard library531 // and is fast enough to not warrant murmur3 hashing. No need to be532 // crypto secure here either so no need to eat the performance cost of a better533 // hash method534 if Duplicates {535 fileJob.Hash, _ = blake2b.New256(nil)536 }537538 // If the file has a length of 0 it is empty then we say it has no lines539 if fileJob.Bytes == 0 {540 fileJob.Lines = 0541 return542 }543544 LanguageFeaturesMutex.Lock()545 langFeatures := LanguageFeatures[fileJob.Language]546 LanguageFeaturesMutex.Unlock()547548 if langFeatures.Complexity == nil {549 langFeatures.Complexity = &Trie{}550 }551 if langFeatures.SingleLineComments == nil {552 langFeatures.SingleLineComments = &Trie{}553 }554 if langFeatures.MultiLineComments == nil {555 langFeatures.MultiLineComments = &Trie{}556 }557 if langFeatures.Strings == nil {558 langFeatures.Strings = &Trie{}559 }560 if langFeatures.Tokens == nil {561 langFeatures.Tokens = &Trie{}562 }563564 endPoint := int(fileJob.Bytes - 1)565 currentState := SBlank566 endComments := [][]byte{}567 endString := []byte{}568569 // TODO needs to be set via langFeatures.Quotes[0].IgnoreEscape for the matching feature570 ignoreEscape := false571 if fileJob.TrackComplexityLines {572 fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)573 }574575 if fileJob.ClassifyContent {576 fileJob.ContentByteType = make([]byte, fileJob.Bytes)577 }578579 for index := checkBomSkip(fileJob); index < int(fileJob.Bytes); index++ {580 if fileJob.ContentByteType != nil {581 fileJob.ContentByteType[index] = stateToByteType(currentState)582 }583584 // Based on our current state determine if the state should change by checking585 // what the character is. The below is very CPU bound so need to be careful if586 // changing anything in here and profile/measure afterwards!587 // NB that the order of the if statements matters and has been set to what in benchmarks is most efficient588 if !isWhitespace(fileJob.Content[index]) {589590 switch currentState {591 case SCode:592 index, currentState, endString, endComments, ignoreEscape = codeState(593 fileJob,594 index,595 endPoint,596 currentState,597 endString,598 endComments,599 langFeatures,600 &fileJob.Hash,601 )602 case SString:603 index, currentState = stringState(fileJob, index, endPoint, endString, currentState, ignoreEscape)604 case SDocString:605 // For a docstring we can either move into blank in which case we count it as a docstring606 // or back into code in which case it should be counted as code607 index, currentState = docStringState(fileJob, index, endPoint, endString, currentState)608 case SMulticomment, SMulticommentCode:609 index, currentState, endString, endComments = commentState(610 fileJob,611 index,612 endPoint,613 currentState,614 endComments,615 endString,616 langFeatures,617 )618 case SBlank, SMulticommentBlank:619 // From blank we can move into comment, move into a multiline comment620 // or move into code but we can only do one.621 index, currentState, endString, endComments, ignoreEscape = blankState(622 fileJob,623 index,624 currentState,625 endComments,626 endString,627 langFeatures,628 )629 }630 }631632 // We shouldn't normally need this, but unclosed strings or comments633 // might leave the index past the end of the file when we reach this634 // point.635 if index >= len(fileJob.Content) {636 return637 }638639 // Only check the first 10000 characters for null bytes indicating a binary file640 // and if we find it then we return otherwise carry on and ignore binary markers641 if index < 10000 && fileJob.Binary {642 return643 }644645 // This means the end of processing the line so calculate the stats according to what state646 // we are currently in647 if fileJob.Content[index] == '\n' || index >= endPoint {648 fileJob.Lines++649 if fileJob.TrackComplexityLines {650 fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)651 }652653 if NoLarge && fileJob.Lines >= LargeLineCount {654 // Save memory by unsetting the content as we no longer require it655 fileJob.Content = nil656 return657 }658659 switch currentState {660 case SCode, SString, SCommentCode, SMulticommentCode:661 fileJob.Code++662 currentState = resetState(currentState)663 if fileJob.Callback != nil {664 if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_CODE) {665 return666 }667 }668 if Trace {669 // Don't remove the outside if-statements, for performance670 printTraceF("%s line %d ended with state: %d: counted as code", fileJob.Location, fileJob.Lines, currentState)671 }672 case SComment, SMulticomment, SMulticommentBlank:673 fileJob.Comment++674 currentState = resetState(currentState)675 if fileJob.Callback != nil {676 if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_COMMENT) {677 return678 }679 }680 if Trace {681 // Same as above682 printTraceF("%s line %d ended with state: %d: counted as comment", fileJob.Location, fileJob.Lines, currentState)683 }684 case SBlank:685 fileJob.Blank++686 if fileJob.Callback != nil {687 if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_BLANK) {688 return689 }690 }691 if Trace {692 // Same as above693 printTraceF("%s line %d ended with state: %d: counted as blank", fileJob.Location, fileJob.Lines, currentState)694 }695 case SDocString:696 fileJob.Comment++697 if fileJob.Callback != nil {698 if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_COMMENT) {699 return700 }701 }702 if Trace {703 // Same as above704 printTraceF("%s line %d ended with state: %d: counted as comment", fileJob.Location, fileJob.Lines, currentState)705 }706 }707 }708 }709710 if UlocMode {711 uloc := map[string]struct{}{}712 for l := range strings.SplitSeq(strings.TrimRight(string(fileJob.Content), "\n"), "\n") {713 uloc[l] = struct{}{}714 }715 fileJob.Uloc = len(uloc)716 }717718 if MaxMean {719 for l := range strings.SplitSeq(strings.TrimRight(string(fileJob.Content), "\n"), "\n") {720 fileJob.LineLength = append(fileJob.LineLength, len(l))721 }722 }723724 isGenerated := false725726 if Generated {727 headLen := min(1000, len(fileJob.Content))728 head := bytes.ToLower(fileJob.Content[0:headLen])729 for _, marker := range GeneratedMarkers {730 if bytes.Contains(head, bytes.ToLower([]byte(marker))) {731 fileJob.Generated = true732 fileJob.Language = fileJob.Language + " (gen)"733 isGenerated = true734 printWarnF("%s identified as isGenerated with heading comment", fileJob.Filename)735 break736 }737 }738 }739740 // check if 0 as well to avoid divide by zero https://github.com/boyter/scc/issues/223741 if !isGenerated && Minified && fileJob.Lines != 0 {742 avgLineByteCount := len(fileJob.Content) / int(fileJob.Lines)743 minifiedGeneratedCheck(avgLineByteCount, fileJob)744 }745746 if fileJob.TrackComplexityLines {747 fileJob.ComplexityLine = fileJob.ComplexityLine[:fileJob.Lines]748 }749}750751func minifiedGeneratedCheck(avgLineByteCount int, fileJob *FileJob) {752 if avgLineByteCount >= MinifiedGeneratedLineByteLength {753 fileJob.Minified = true754 fileJob.Language = fileJob.Language + " (min)"755 printWarnF("%s identified as minified/generated with average line byte length of %d >= %d", fileJob.Filename, avgLineByteCount, MinifiedGeneratedLineByteLength)756 } else {757 printDebugF("%s not identified as minified/generated with average line byte length of %d < %d", fileJob.Filename, avgLineByteCount, MinifiedGeneratedLineByteLength)758 }759}760761// Check if we have any Byte Order Marks (BOM) in front of the file762func checkBomSkip(fileJob *FileJob) int {763 // UTF-8 BOM which if detected we should skip the BOM as we can then count correctly764 // []byte is UTF-8 BOM taken from https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding765 if bytes.HasPrefix(fileJob.Content, []byte{239, 187, 191}) {766 printWarnF("UTF-8 BOM found for file %s skipping 3 bytes", fileJob.Filename)767 return 3768 }769770 // If we have one of the other BOM then we might not be able to count correctly so if verbose let the user know771 if Verbose {772 for _, v := range ByteOrderMarks {773 if bytes.HasPrefix(fileJob.Content, v) {774 printWarnF("BOM found for file %s indicating it is not ASCII/UTF-8 and may be counted incorrectly or ignored as a binary file", fileJob.Filename)775 }776 }777 }778779 return 0780}781782// Reads and processes files from input chan in parallel, and sends results to783// output chan784func (ctx processorContext) fileProcessorWorker(input chan *FileJob, output chan *FileJob) {785 var startTime int64786 var fileCount int64787 var gcEnabled int64788 var wg sync.WaitGroup789790 for i := 0; i < FileProcessJobWorkers; i++ {791 wg.Go(func() {792 reader := NewFileReader()793794 for job := range input {795 atomic.CompareAndSwapInt64(&startTime, 0, makeTimestampMilli())796797 loc := job.Location798 if job.Symlocation != "" {799 loc = job.Symlocation800 }801802 fileStartTime := makeTimestampNano()803 content, err := reader.ReadFile(loc, int(job.Bytes))804 atomic.AddInt64(&fileCount, 1)805806 if atomic.LoadInt64(&gcEnabled) == 0 && atomic.LoadInt64(&fileCount) >= int64(GcFileCount) {807 debug.SetGCPercent(gcPercent)808 atomic.AddInt64(&gcEnabled, 1)809 printWarn("read file limit exceeded GC re-enabled")810 }811812 printTraceF("nanoseconds read into memory: %s: %d", job.Location, makeTimestampNano()-fileStartTime)813814 if err == nil {815 job.Content = content816 if ctx.processFile(job) {817 output <- job818 }819 } else {820 printWarnF("error reading: %s %s", job.Location, err)821 }822 }823824 })825 }826827 go func() {828 wg.Wait()829 close(output)830831 printDebugF("milliseconds reading files into memory: %d", makeTimestampMilli()-startTime)832 }()833}834835// Process a single file836// File must have been read to job.Content already837func (ctx processorContext) processFile(job *FileJob) bool {838 fileStartTime := makeTimestampNano()839840 contents := job.Content841842 // Needs to always run to ensure the language is set843 job.Language = DetermineLanguage(job.Filename, job.Language, job.PossibleLanguages, job.Content)844845 remapped := false846 if len(ctx.remap.all) != 0 {847 ctx.hardRemapLanguage(job)848 }849850 // If the type is #! we should check to see if we can identify851 if job.Language == SheBang {852 if len(ctx.remap.unknown) != 0 {853 remapped = ctx.unknownRemapLanguage(job)854 }855856 // if we didn't remap we then want to see if it's a #! map857 if !remapped {858 cutoff := min(200, len(contents))859860 lang, err := DetectSheBang(contents[:cutoff])861 if err != nil {862 printWarnF("unable to determine #! language for %s", job.Location)863 return false864 }865866 printWarnF("detected #! %s for %s", lang, job.Location)867 job.Language = lang868 LoadLanguageFeature(lang)869 }870 }871872 CountStats(job)873874 if Duplicates {875 duplicates.mux.Lock()876 jobHash := job.Hash.Sum(nil)877 if duplicates.Check(job.Bytes, jobHash) {878 printWarnF("skipping duplicate file: %s", job.Location)879 duplicates.mux.Unlock()880 return false881 }882883 duplicates.Add(job.Bytes, jobHash)884 duplicates.mux.Unlock()885 }886887 if IgnoreMinified && job.Minified {888 printWarnF("skipping minified file: %s", job.Location)889 return false890 }891892 if IgnoreGenerated && job.Generated {893 printWarnF("skipping generated file: %s", job.Location)894 return false895 }896897 if NoLarge && job.Lines >= LargeLineCount {898 printWarnF("skipping large file due to line length: %s", job.Location)899 return false900 }901902 printTraceF("nanoseconds process: %s: %d", job.Location, makeTimestampNano()-fileStartTime)903904 if job.Binary {905 printWarnF("skipping file identified as binary: %s", job.Location)906 return false907 }908909 // This needs to be at the end so we can ensure duplicate detection et.al run first910 // avoiding inflating the counts911 if UlocMode {912 ulocMutex.Lock()913914 for l := range strings.SplitSeq(strings.TrimRight(string(job.Content), "\n"), "\n") {915 ulocGlobalCount[l] = struct{}{}916917 _, ok := ulocLanguageCount[job.Language]918 if !ok {919 ulocLanguageCount[job.Language] = map[string]struct{}{}920 }921 ulocLanguageCount[job.Language][l] = struct{}{}922 }923 ulocMutex.Unlock()924 }925926 return true927}928929func (ctx processorContext) hardRemapLanguage(job *FileJob) bool {930 remapped := false931 cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look932933 for _, rule := range ctx.remap.all {934 if bytes.Contains(job.Content[:cutoff], rule.pattern) {935 job.Language = rule.language936 remapped = true937 printWarnF("hard remapping: %s to %s", job.Location, job.Language)938 }939 }940941 return remapped942}943944func (ctx processorContext) unknownRemapLanguage(job *FileJob) bool {945 remapped := false946 cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look947948 for _, rule := range ctx.remap.unknown {949 if bytes.Contains(job.Content[:cutoff], rule.pattern) {950 job.Language = rule.language951 remapped = true952 printWarnF("unknown remapping: %s to %s", job.Location, job.Language)953 }954 }955956 return remapped957}
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.