processor/workers.go GO 958 lines View on github.com → Search inside
1// SPDX-License-Identifier: MIT23package processor45import (6	"bytes"7	"hash"8	"runtime/debug"9	"strings"10	"sync"11	"sync/atomic"1213	"golang.org/x/crypto/blake2b"14)1516// The below are used as identifiers for the code state machine17const (18	SBlank             int64 = 119	SCode              int64 = 220	SComment           int64 = 321	SCommentCode       int64 = 4 // Indicates comment after code22	SMulticomment      int64 = 523	SMulticommentCode  int64 = 6 // Indicates multi comment after code24	SMulticommentBlank int64 = 7 // Indicates multi comment ended with blank afterward25	SString            int64 = 826	SDocString         int64 = 927)2829// SheBang is a global constant for indicating a shebang file header30const SheBang string = "#!"3132// LineType what type of line are processing33type LineType int323435// These are not meant to be CAMEL_CASE but as it us used by an external project we cannot change it36const (37	LINE_BLANK LineType = iota38	LINE_CODE39	LINE_COMMENT40)4142// ByteOrderMarks are taken from https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding43// These indicate that we cannot count the file correctly so we can at least warn the user44var ByteOrderMarks = [][]byte{45	{254, 255},            // UTF-16 BE46	{255, 254},            // UTF-16 LE47	{0, 0, 254, 255},      // UTF-32 BE48	{255, 254, 0, 0},      // UTF-32 LE49	{43, 47, 118, 56},     // UTF-750	{43, 47, 118, 57},     // UTF-751	{43, 47, 118, 43},     // UTF-752	{43, 47, 118, 47},     // UTF-753	{43, 47, 118, 56, 45}, // UTF-754	{247, 100, 76},        // UTF-155	{221, 115, 102, 115},  // UTF-EBCDIC56	{14, 254, 255},        // SCSU57	{251, 238, 40},        // BOCU-158	{132, 49, 149, 51},    // GB-1803059}6061var duplicates = CheckDuplicates{62	hashes: make(map[int64][][]byte),63}6465func checkForMatchSingle(currentByte byte, index int, endPoint int, matches []byte, fileJob *FileJob) bool {66	potentialMatch := true67	if currentByte == matches[0] {68		for j := range matches {69			if index+j >= endPoint || matches[j] != fileJob.Content[index+j] {70				potentialMatch = false71				break72			}73		}7475		if potentialMatch {76			return true77		}78	}7980	return false81}8283func isWhitespace(currentByte byte) bool {84	if currentByte != ' ' && currentByte != '\t' && currentByte != '\n' && currentByte != '\r' {85		return false86	}8788	return true89}9091func isIdentifierContinue(b byte) bool {92	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9') || b == '_'93}9495func hasNonWhitespaceBefore(content []byte, index int) bool {96	for i := index - 1; i >= 0; i-- {97		if !isWhitespace(content[i]) {98			return true99		}100	}101102	return false103}104105func nextNonWhitespaceIndex(content []byte, index int) int {106	for index < len(content) && isWhitespace(content[index]) {107		index++108	}109110	return index111}112113func hasPostfixExclude(content []byte, index, offsetJump int, excludes [][]byte) bool {114	token := content[index : index+offsetJump]115	for _, exclude := range excludes {116		if len(exclude) < offsetJump || !bytes.Equal(token, exclude[:offsetJump]) {117			continue118		}119120		remaining := exclude[offsetJump:]121		if len(remaining) == 0 {122			return true123		}124125		next := nextNonWhitespaceIndex(content, index+offsetJump)126		if next+len(remaining) > len(content) || !bytes.Equal(content[next:next+len(remaining)], remaining) {127			continue128		}129130		afterExclude := next + len(remaining)131		if isIdentifierContinue(remaining[len(remaining)-1]) {132			return afterExclude == len(content) || !isIdentifierContinue(content[afterExclude])133		}134135		return true136	}137138	return false139}140141func countComplexityPostfix(fileJob *FileJob, index, offsetJump int, postfixExcludes [][]byte) {142	if index == 0 {143		return144	}145146	content := fileJob.Content147	if isWhitespace(content[index-1]) && !hasNonWhitespaceBefore(content, index-1) {148		return149	}150151	if len(postfixExcludes) > 0 && hasPostfixExclude(content, index, offsetJump, postfixExcludes) {152		return153	}154155	fileJob.Complexity++156	fileJob.bumpComplexityLine()157}158159// bumpComplexityLine adds one complexity tick to the line currently being160// counted. No-op when TrackComplexityLines is off — ComplexityLine is left161// empty by CountStats in that case, so there is no slot to bump.162func (fileJob *FileJob) bumpComplexityLine() {163	if n := len(fileJob.ComplexityLine); n > 0 {164		fileJob.ComplexityLine[n-1]++165	}166}167168// Check if this file is binary by checking for nul byte and if so bail out169// this is how GNU Grep, git and ripgrep check for binary files170func isBinary(index int, currentByte byte) bool {171	return index < 10000 && !DisableCheckBinary && currentByte == 0172}173174func shouldProcess(currentByte, processBytesMask byte) bool {175	return currentByte&processBytesMask == currentByte176}177178func stateToByteType(state int64) byte {179	switch state {180	case SCode:181		return ByteTypeCode182	case SString:183		return ByteTypeString184	case SComment, SCommentCode, SMulticomment, SMulticommentCode, SMulticommentBlank, SDocString:185		return ByteTypeComment186	default: // SBlank187		return ByteTypeBlank188	}189}190191func resetState(currentState int64) int64 {192	switch currentState {193	case SMulticomment, SMulticommentCode:194		currentState = SMulticomment195	case SString:196		currentState = SString197	default:198		currentState = SBlank199	}200201	return currentState202}203204func stringState(fileJob *FileJob, index int, endPoint int, endString []byte, currentState int64, ignoreEscape bool) (int, int64) {205	// It's not possible to enter this state without checking at least 1 byte so it is safe to check -1 here206	// without checking if it is out of bounds first207	for i := index; i < endPoint; i++ {208		index = i209210		if fileJob.ContentByteType != nil {211			fileJob.ContentByteType[i] = ByteTypeString212		}213214		// If we hit a newline, return because we want to count the stats but keep215		// the current state so we end up back in this loop when the outer216		// one calls again217		if fileJob.Content[i] == '\n' {218			return i, currentState219		}220221		is_escaped := false222		// if there is an escape symbol before us, investigate223		if fileJob.Content[i-1] == '\\' {224			num_escapes := 0225			for j := i - 1; j > 0; j-- {226				if fileJob.Content[j] != '\\' {227					break228				}229				num_escapes++230			}231232			// if number of escapes is even, all escapes are themselves escaped233			// otherwise the last escape does escape current string terminator234			if num_escapes%2 != 0 {235				is_escaped = true236			}237		}238239		// If we are in a literal string we want to ignore escapes OR we aren't checking for special ones240		if ignoreEscape || !is_escaped {241			if checkForMatchSingle(fileJob.Content[i], index, endPoint, endString, fileJob) {242				return i, SCode243			}244		}245	}246247	return index, currentState248}249250// This is a special state check pretty much only ever used by Python codebases251// but potentially it could be expanded to deal with other types252func docStringState(fileJob *FileJob, index int, endPoint int, endString []byte, currentState int64) (int, int64) {253	// It's not possible to enter this state without checking at least 1 byte so it is safe to check -1 here254	// without checking if it is out of bounds first255	for i := index; i < endPoint; i++ {256		index = i257258		if fileJob.ContentByteType != nil {259			fileJob.ContentByteType[i] = ByteTypeComment260		}261262		if fileJob.Content[i] == '\n' {263			return i, currentState264		}265266		if fileJob.Content[i-1] != '\\' {267			if checkForMatchSingle(fileJob.Content[i], index, endPoint, endString, fileJob) {268				// So we have hit end of docstring at this point in which case check if only whitespace characters till the next269				// newline and if so we change to a comment otherwise to code270				// need to start the loop after ending definition of docstring, therefore adding the length of the string to271				// the index272				for j := index + len(endString); j <= endPoint; j++ {273					if fileJob.Content[j] == '\n' {274						printDebug("Found newline so docstring is comment")275						return i, SComment276					}277278					if !isWhitespace(fileJob.Content[j]) {279						printDebugF("Found something not whitespace so is code: %s", string(fileJob.Content[j]))280						return i, SCode281					}282				}283284				return i, SCode285			}286		}287	}288289	return index, currentState290}291292func codeState(293	fileJob *FileJob,294	index int,295	endPoint int,296	currentState int64,297	endString []byte,298	endComments [][]byte,299	langFeatures LanguageFeature,300	digest *hash.Hash,301) (int, int64, []byte, [][]byte, bool) {302	// Hacky fix to https://github.com/boyter/scc/issues/181303	if endPoint > len(fileJob.Content) {304		endPoint--305	}306307	for i := index; i < endPoint; i++ {308		curByte := fileJob.Content[i]309		index = i310311		if fileJob.ContentByteType != nil {312			fileJob.ContentByteType[i] = ByteTypeCode313		}314315		if curByte == '\n' {316			return i, currentState, endString, endComments, false317		}318319		if isBinary(i, curByte) {320			fileJob.Binary = true321			return i, currentState, endString, endComments, false322		}323324		if shouldProcess(curByte, langFeatures.ProcessMask) {325			if Duplicates {326				// Technically this is wrong because we skip bytes, so this is not a true327				// hash of the file contents, but for duplicate files it shouldn't matter328				// as both will skip the same way329				digestible := []byte{fileJob.Content[index]}330				(*digest).Write(digestible)331			}332333			switch tokenType, offsetJump, endString := langFeatures.Tokens.Match(fileJob.Content[i:]); tokenType {334			case TString:335				// If we are in string state then check what sort of string so we know if docstring OR ignoreescape string336				i, ignoreEscape := verifyIgnoreEscape(langFeatures, fileJob, index)337338				// It is safe to -1 here as to enter the code state we need to have339				// transitioned from blank to here hence i should always be >= 1340				// This check is to ensure we aren't in a character declaration341				// TODO this should use language features342				if fileJob.Content[i-1] != '\\' {343					currentState = SString344				}345346				return i, currentState, endString, endComments, ignoreEscape347348			case TSlcomment:349				currentState = SCommentCode350				return i, currentState, endString, endComments, false351352			case TMlcomment:353				if langFeatures.Nested || len(endComments) == 0 {354					endComments = append(endComments, endString)355					currentState = SMulticommentCode356					i += offsetJump - 1357358					return i, currentState, endString, endComments, false359				}360361			case TComplexity:362				if index == 0 || isWhitespace(fileJob.Content[index-1]) {363					fileJob.Complexity++364					fileJob.bumpComplexityLine()365				}366367			case TComplexityPostfix:368				countComplexityPostfix(fileJob, index, offsetJump, langFeatures.PostfixExcludes)369			}370		}371	}372373	return index, currentState, endString, endComments, false374}375376func commentState(fileJob *FileJob, index int, endPoint int, currentState int64, endComments [][]byte, endString []byte, langFeatures LanguageFeature) (int, int64, []byte, [][]byte) {377	for i := index; i < endPoint; i++ {378		curByte := fileJob.Content[i]379		index = i380381		if fileJob.ContentByteType != nil {382			fileJob.ContentByteType[i] = ByteTypeComment383		}384385		if curByte == '\n' {386			return i, currentState, endString, endComments387		}388389		if checkForMatchSingle(curByte, index, endPoint, endComments[len(endComments)-1], fileJob) {390			// set offset jump here391			offsetJump := len(endComments[len(endComments)-1])392			endComments = endComments[:len(endComments)-1]393394			if len(endComments) == 0 {395				// If we started as multiline code switch back to code so we count correctly396				// IE i := 1 /* for the lols */397				// TODO is that required? Might still be required to count correctly398				if currentState == SMulticommentCode {399					currentState = SCode // TODO pointless to change here, just set S_MULTICOMMENT_BLANK400				} else {401					currentState = SMulticommentBlank402				}403			}404405			i += offsetJump - 1406			return i, currentState, endString, endComments407		}408		// Check if we are entering another multiline comment409		// This should come below check for match single as it speeds up processing410		if langFeatures.Nested || len(endComments) == 0 {411			if ok, offsetJump, endString := langFeatures.MultiLineComments.Match(fileJob.Content[i:]); ok != 0 {412				endComments = append(endComments, endString)413				i += offsetJump - 1414415				return i, currentState, endString, endComments416			}417		}418	}419420	return index, currentState, endString, endComments421}422423func blankState(424	fileJob *FileJob,425	index int,426	currentState int64,427	endComments [][]byte,428	endString []byte,429	langFeatures LanguageFeature,430) (int, int64, []byte, [][]byte, bool) {431	switch tokenType, offsetJump, endString := langFeatures.Tokens.Match(fileJob.Content[index:]); tokenType {432	case TMlcomment:433		if langFeatures.Nested || len(endComments) == 0 {434			endComments = append(endComments, endString)435			currentState = SMulticomment436			index += offsetJump - 1437			if fileJob.ContentByteType != nil {438				fileJob.ContentByteType[index] = ByteTypeComment439			}440			return index, currentState, endString, endComments, false441		}442443	case TSlcomment:444		currentState = SComment445		if fileJob.ContentByteType != nil {446			fileJob.ContentByteType[index] = ByteTypeComment447		}448		return index, currentState, endString, endComments, false449450	case TString:451		index, ignoreEscape := verifyIgnoreEscape(langFeatures, fileJob, index)452453		for _, v := range langFeatures.Quotes {454			if v.End == string(endString) && v.DocString {455				currentState = SDocString456				if fileJob.ContentByteType != nil {457					fileJob.ContentByteType[index] = ByteTypeComment458				}459				return index, currentState, endString, endComments, ignoreEscape460			}461		}462		currentState = SString463		if fileJob.ContentByteType != nil {464			fileJob.ContentByteType[index] = ByteTypeString465		}466		return index, currentState, endString, endComments, ignoreEscape467468	case TComplexity:469		currentState = SCode470		if fileJob.ContentByteType != nil {471			fileJob.ContentByteType[index] = ByteTypeCode472		}473		if index == 0 || isWhitespace(fileJob.Content[index-1]) {474			fileJob.Complexity++475			fileJob.bumpComplexityLine()476		}477478	case TComplexityPostfix:479		currentState = SCode480		if fileJob.ContentByteType != nil {481			fileJob.ContentByteType[index] = ByteTypeCode482		}483		countComplexityPostfix(fileJob, index, offsetJump, langFeatures.PostfixExcludes)484485	default:486		currentState = SCode487		if fileJob.ContentByteType != nil {488			fileJob.ContentByteType[index] = ByteTypeCode489		}490	}491492	return index, currentState, endString, endComments, false493}494495// Some languages such as C# have quoted strings like @"\" where no escape character is required496// this checks if there is one so we can cater for these cases497func verifyIgnoreEscape(langFeatures LanguageFeature, fileJob *FileJob, index int) (int, bool) {498	ignoreEscape := false499500	// loop over the string states and if we have the special flag match, and if so we need to ensure we can handle them501	for i := 0; i < len(langFeatures.Quotes); i++ {502		if langFeatures.Quotes[i].DocString || langFeatures.Quotes[i].IgnoreEscape {503			// If so we need to check if where we are falls into these conditions504			isMatch := true505			for j := 0; j < len(langFeatures.Quotes[i].Start); j++ {506				if len(fileJob.Content) <= index+j || fileJob.Content[index+j] != langFeatures.Quotes[i].Start[j] {507					isMatch = false508					break509				}510			}511512			// If we have a match then jump ahead enough so we don't pick it up again for cases like @"513			if isMatch {514				ignoreEscape = true515				index = index + len(langFeatures.Quotes[i].Start)516			}517		}518	}519520	return index, ignoreEscape521}522523// CountStats will process the fileJob524// If the file contains anything even just a newline its line count should be >= 1.525// If the file has a size of 0 its line count should be 0.526// Newlines belong to the line they started on so a file of \n means only 1 line527// This is the 'hot' path for the application and needs to be as fast as possible528func CountStats(fileJob *FileJob) {529	// For determining duplicates we need the below. The reason for creating530	// the byte array here is to avoid GC pressure. MD5 is in the standard library531	// and is fast enough to not warrant murmur3 hashing. No need to be532	// crypto secure here either so no need to eat the performance cost of a better533	// hash method534	if Duplicates {535		fileJob.Hash, _ = blake2b.New256(nil)536	}537538	// If the file has a length of 0 it is empty then we say it has no lines539	if fileJob.Bytes == 0 {540		fileJob.Lines = 0541		return542	}543544	LanguageFeaturesMutex.Lock()545	langFeatures := LanguageFeatures[fileJob.Language]546	LanguageFeaturesMutex.Unlock()547548	if langFeatures.Complexity == nil {549		langFeatures.Complexity = &Trie{}550	}551	if langFeatures.SingleLineComments == nil {552		langFeatures.SingleLineComments = &Trie{}553	}554	if langFeatures.MultiLineComments == nil {555		langFeatures.MultiLineComments = &Trie{}556	}557	if langFeatures.Strings == nil {558		langFeatures.Strings = &Trie{}559	}560	if langFeatures.Tokens == nil {561		langFeatures.Tokens = &Trie{}562	}563564	endPoint := int(fileJob.Bytes - 1)565	currentState := SBlank566	endComments := [][]byte{}567	endString := []byte{}568569	// TODO needs to be set via langFeatures.Quotes[0].IgnoreEscape for the matching feature570	ignoreEscape := false571	if fileJob.TrackComplexityLines {572		fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)573	}574575	if fileJob.ClassifyContent {576		fileJob.ContentByteType = make([]byte, fileJob.Bytes)577	}578579	for index := checkBomSkip(fileJob); index < int(fileJob.Bytes); index++ {580		if fileJob.ContentByteType != nil {581			fileJob.ContentByteType[index] = stateToByteType(currentState)582		}583584		// Based on our current state determine if the state should change by checking585		// what the character is. The below is very CPU bound so need to be careful if586		// changing anything in here and profile/measure afterwards!587		// NB that the order of the if statements matters and has been set to what in benchmarks is most efficient588		if !isWhitespace(fileJob.Content[index]) {589590			switch currentState {591			case SCode:592				index, currentState, endString, endComments, ignoreEscape = codeState(593					fileJob,594					index,595					endPoint,596					currentState,597					endString,598					endComments,599					langFeatures,600					&fileJob.Hash,601				)602			case SString:603				index, currentState = stringState(fileJob, index, endPoint, endString, currentState, ignoreEscape)604			case SDocString:605				// For a docstring we can either move into blank in which case we count it as a docstring606				// or back into code in which case it should be counted as code607				index, currentState = docStringState(fileJob, index, endPoint, endString, currentState)608			case SMulticomment, SMulticommentCode:609				index, currentState, endString, endComments = commentState(610					fileJob,611					index,612					endPoint,613					currentState,614					endComments,615					endString,616					langFeatures,617				)618			case SBlank, SMulticommentBlank:619				// From blank we can move into comment, move into a multiline comment620				// or move into code but we can only do one.621				index, currentState, endString, endComments, ignoreEscape = blankState(622					fileJob,623					index,624					currentState,625					endComments,626					endString,627					langFeatures,628				)629			}630		}631632		// We shouldn't normally need this, but unclosed strings or comments633		// might leave the index past the end of the file when we reach this634		// point.635		if index >= len(fileJob.Content) {636			return637		}638639		// Only check the first 10000 characters for null bytes indicating a binary file640		// and if we find it then we return otherwise carry on and ignore binary markers641		if index < 10000 && fileJob.Binary {642			return643		}644645		// This means the end of processing the line so calculate the stats according to what state646		// we are currently in647		if fileJob.Content[index] == '\n' || index >= endPoint {648			fileJob.Lines++649			if fileJob.TrackComplexityLines {650				fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)651			}652653			if NoLarge && fileJob.Lines >= LargeLineCount {654				// Save memory by unsetting the content as we no longer require it655				fileJob.Content = nil656				return657			}658659			switch currentState {660			case SCode, SString, SCommentCode, SMulticommentCode:661				fileJob.Code++662				currentState = resetState(currentState)663				if fileJob.Callback != nil {664					if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_CODE) {665						return666					}667				}668				if Trace {669					// Don't remove the outside if-statements, for performance670					printTraceF("%s line %d ended with state: %d: counted as code", fileJob.Location, fileJob.Lines, currentState)671				}672			case SComment, SMulticomment, SMulticommentBlank:673				fileJob.Comment++674				currentState = resetState(currentState)675				if fileJob.Callback != nil {676					if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_COMMENT) {677						return678					}679				}680				if Trace {681					// Same as above682					printTraceF("%s line %d ended with state: %d: counted as comment", fileJob.Location, fileJob.Lines, currentState)683				}684			case SBlank:685				fileJob.Blank++686				if fileJob.Callback != nil {687					if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_BLANK) {688						return689					}690				}691				if Trace {692					// Same as above693					printTraceF("%s line %d ended with state: %d: counted as blank", fileJob.Location, fileJob.Lines, currentState)694				}695			case SDocString:696				fileJob.Comment++697				if fileJob.Callback != nil {698					if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_COMMENT) {699						return700					}701				}702				if Trace {703					// Same as above704					printTraceF("%s line %d ended with state: %d: counted as comment", fileJob.Location, fileJob.Lines, currentState)705				}706			}707		}708	}709710	if UlocMode {711		uloc := map[string]struct{}{}712		for l := range strings.SplitSeq(strings.TrimRight(string(fileJob.Content), "\n"), "\n") {713			uloc[l] = struct{}{}714		}715		fileJob.Uloc = len(uloc)716	}717718	if MaxMean {719		for l := range strings.SplitSeq(strings.TrimRight(string(fileJob.Content), "\n"), "\n") {720			fileJob.LineLength = append(fileJob.LineLength, len(l))721		}722	}723724	isGenerated := false725726	if Generated {727		headLen := min(1000, len(fileJob.Content))728		head := bytes.ToLower(fileJob.Content[0:headLen])729		for _, marker := range GeneratedMarkers {730			if bytes.Contains(head, bytes.ToLower([]byte(marker))) {731				fileJob.Generated = true732				fileJob.Language = fileJob.Language + " (gen)"733				isGenerated = true734				printWarnF("%s identified as isGenerated with heading comment", fileJob.Filename)735				break736			}737		}738	}739740	// check if 0 as well to avoid divide by zero https://github.com/boyter/scc/issues/223741	if !isGenerated && Minified && fileJob.Lines != 0 {742		avgLineByteCount := len(fileJob.Content) / int(fileJob.Lines)743		minifiedGeneratedCheck(avgLineByteCount, fileJob)744	}745746	if fileJob.TrackComplexityLines {747		fileJob.ComplexityLine = fileJob.ComplexityLine[:fileJob.Lines]748	}749}750751func minifiedGeneratedCheck(avgLineByteCount int, fileJob *FileJob) {752	if avgLineByteCount >= MinifiedGeneratedLineByteLength {753		fileJob.Minified = true754		fileJob.Language = fileJob.Language + " (min)"755		printWarnF("%s identified as minified/generated with average line byte length of %d >= %d", fileJob.Filename, avgLineByteCount, MinifiedGeneratedLineByteLength)756	} else {757		printDebugF("%s not identified as minified/generated with average line byte length of %d < %d", fileJob.Filename, avgLineByteCount, MinifiedGeneratedLineByteLength)758	}759}760761// Check if we have any Byte Order Marks (BOM) in front of the file762func checkBomSkip(fileJob *FileJob) int {763	// UTF-8 BOM which if detected we should skip the BOM as we can then count correctly764	// []byte is UTF-8 BOM taken from https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding765	if bytes.HasPrefix(fileJob.Content, []byte{239, 187, 191}) {766		printWarnF("UTF-8 BOM found for file %s skipping 3 bytes", fileJob.Filename)767		return 3768	}769770	// If we have one of the other BOM then we might not be able to count correctly so if verbose let the user know771	if Verbose {772		for _, v := range ByteOrderMarks {773			if bytes.HasPrefix(fileJob.Content, v) {774				printWarnF("BOM found for file %s indicating it is not ASCII/UTF-8 and may be counted incorrectly or ignored as a binary file", fileJob.Filename)775			}776		}777	}778779	return 0780}781782// Reads and processes files from input chan in parallel, and sends results to783// output chan784func (ctx processorContext) fileProcessorWorker(input chan *FileJob, output chan *FileJob) {785	var startTime int64786	var fileCount int64787	var gcEnabled int64788	var wg sync.WaitGroup789790	for i := 0; i < FileProcessJobWorkers; i++ {791		wg.Go(func() {792			reader := NewFileReader()793794			for job := range input {795				atomic.CompareAndSwapInt64(&startTime, 0, makeTimestampMilli())796797				loc := job.Location798				if job.Symlocation != "" {799					loc = job.Symlocation800				}801802				fileStartTime := makeTimestampNano()803				content, err := reader.ReadFile(loc, int(job.Bytes))804				atomic.AddInt64(&fileCount, 1)805806				if atomic.LoadInt64(&gcEnabled) == 0 && atomic.LoadInt64(&fileCount) >= int64(GcFileCount) {807					debug.SetGCPercent(gcPercent)808					atomic.AddInt64(&gcEnabled, 1)809					printWarn("read file limit exceeded GC re-enabled")810				}811812				printTraceF("nanoseconds read into memory: %s: %d", job.Location, makeTimestampNano()-fileStartTime)813814				if err == nil {815					job.Content = content816					if ctx.processFile(job) {817						output <- job818					}819				} else {820					printWarnF("error reading: %s %s", job.Location, err)821				}822			}823824		})825	}826827	go func() {828		wg.Wait()829		close(output)830831		printDebugF("milliseconds reading files into memory: %d", makeTimestampMilli()-startTime)832	}()833}834835// Process a single file836// File must have been read to job.Content already837func (ctx processorContext) processFile(job *FileJob) bool {838	fileStartTime := makeTimestampNano()839840	contents := job.Content841842	// Needs to always run to ensure the language is set843	job.Language = DetermineLanguage(job.Filename, job.Language, job.PossibleLanguages, job.Content)844845	remapped := false846	if len(ctx.remap.all) != 0 {847		ctx.hardRemapLanguage(job)848	}849850	// If the type is #! we should check to see if we can identify851	if job.Language == SheBang {852		if len(ctx.remap.unknown) != 0 {853			remapped = ctx.unknownRemapLanguage(job)854		}855856		// if we didn't remap we then want to see if it's a #! map857		if !remapped {858			cutoff := min(200, len(contents))859860			lang, err := DetectSheBang(contents[:cutoff])861			if err != nil {862				printWarnF("unable to determine #! language for %s", job.Location)863				return false864			}865866			printWarnF("detected #! %s for %s", lang, job.Location)867			job.Language = lang868			LoadLanguageFeature(lang)869		}870	}871872	CountStats(job)873874	if Duplicates {875		duplicates.mux.Lock()876		jobHash := job.Hash.Sum(nil)877		if duplicates.Check(job.Bytes, jobHash) {878			printWarnF("skipping duplicate file: %s", job.Location)879			duplicates.mux.Unlock()880			return false881		}882883		duplicates.Add(job.Bytes, jobHash)884		duplicates.mux.Unlock()885	}886887	if IgnoreMinified && job.Minified {888		printWarnF("skipping minified file: %s", job.Location)889		return false890	}891892	if IgnoreGenerated && job.Generated {893		printWarnF("skipping generated file: %s", job.Location)894		return false895	}896897	if NoLarge && job.Lines >= LargeLineCount {898		printWarnF("skipping large file due to line length: %s", job.Location)899		return false900	}901902	printTraceF("nanoseconds process: %s: %d", job.Location, makeTimestampNano()-fileStartTime)903904	if job.Binary {905		printWarnF("skipping file identified as binary: %s", job.Location)906		return false907	}908909	// This needs to be at the end so we can ensure duplicate detection et.al run first910	// avoiding inflating the counts911	if UlocMode {912		ulocMutex.Lock()913914		for l := range strings.SplitSeq(strings.TrimRight(string(job.Content), "\n"), "\n") {915			ulocGlobalCount[l] = struct{}{}916917			_, ok := ulocLanguageCount[job.Language]918			if !ok {919				ulocLanguageCount[job.Language] = map[string]struct{}{}920			}921			ulocLanguageCount[job.Language][l] = struct{}{}922		}923		ulocMutex.Unlock()924	}925926	return true927}928929func (ctx processorContext) hardRemapLanguage(job *FileJob) bool {930	remapped := false931	cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look932933	for _, rule := range ctx.remap.all {934		if bytes.Contains(job.Content[:cutoff], rule.pattern) {935			job.Language = rule.language936			remapped = true937			printWarnF("hard remapping: %s to %s", job.Location, job.Language)938		}939	}940941	return remapped942}943944func (ctx processorContext) unknownRemapLanguage(job *FileJob) bool {945	remapped := false946	cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look947948	for _, rule := range ctx.remap.unknown {949		if bytes.Contains(job.Content[:cutoff], rule.pattern) {950			job.Language = rule.language951			remapped = true952			printWarnF("unknown remapping: %s to %s", job.Location, job.Language)953		}954	}955956	return remapped957}

Code quality findings 12

Blank identifier discarding results; verify intentional ignoring of return values
warning correctness blank-identifier-discard
fileJob.Hash, _ = blake2b.New256(nil)
Ensure errors are handled or logged
warning correctness unhandled-error
if err != nil {
Ensure paired with Unlock defer to prevent deadlocks
warning correctness lock-without-unlock
ulocMutex.Lock()
Deeply nested control structures reduce readability; consider extracting to functions or using early returns
info maintainability deep-nesting
if fileJob.Content[i] == '\n' {
Multiple appends without pre-allocation; use make() with capacity when size is known
info performance append-without-prealloc
endComments = append(endComments, endString)
Deeply nested control structures reduce readability; consider extracting to functions or using early returns
info maintainability deep-nesting
for j := 0; j < len(langFeatures.Quotes[i].Start); j++ {
Deeply nested control structures reduce readability; consider extracting to functions or using early returns
info maintainability deep-nesting
if len(fileJob.Content) <= index+j || fileJob.Content[index+j] != langFeatures.Quotes[i].Start[j] {
Multiple appends without pre-allocation; use make() with capacity when size is known
info performance append-without-prealloc
fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)
Multiple appends without pre-allocation; use make() with capacity when size is known
info performance append-without-prealloc
fileJob.LineLength = append(fileJob.LineLength, len(l))
String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop
info correctness string-to-byte-in-loop
if bytes.Contains(head, bytes.ToLower([]byte(marker))) {
Deeply nested control structures reduce readability; consider extracting to functions or using early returns
info maintainability deep-nesting
for job := range input {
Adjusting garbage collection settings dynamically can be a sign of deeper problems in the codebase, suggesting a need for better coding practices
info correctness gc-tuning
debug.SetGCPercent(gcPercent)

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.