processor/workers.go · boyter/scc

1// SPDX-License-Identifier: MIT23package processor45import (6	"bytes"7	"hash"8	"runtime/debug"9	"strings"10	"sync"11	"sync/atomic"1213	"golang.org/x/crypto/blake2b"14)1516// The below are used as identifiers for the code state machine17const (18	SBlank             int64 = 119	SCode              int64 = 220	SComment           int64 = 321	SCommentCode       int64 = 4 // Indicates comment after code22	SMulticomment      int64 = 523	SMulticommentCode  int64 = 6 // Indicates multi comment after code24	SMulticommentBlank int64 = 7 // Indicates multi comment ended with blank afterward25	SString            int64 = 826	SDocString         int64 = 927)2829// SheBang is a global constant for indicating a shebang file header30const SheBang string = "#!"3132// LineType what type of line are processing33type LineType int323435// These are not meant to be CAMEL_CASE but as it us used by an external project we cannot change it36const (37	LINE_BLANK LineType = iota38	LINE_CODE39	LINE_COMMENT40)4142// ByteOrderMarks are taken from https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding43// These indicate that we cannot count the file correctly so we can at least warn the user44var ByteOrderMarks = [][]byte{45	{254, 255},            // UTF-16 BE46	{255, 254},            // UTF-16 LE47	{0, 0, 254, 255},      // UTF-32 BE48	{255, 254, 0, 0},      // UTF-32 LE49	{43, 47, 118, 56},     // UTF-750	{43, 47, 118, 57},     // UTF-751	{43, 47, 118, 43},     // UTF-752	{43, 47, 118, 47},     // UTF-753	{43, 47, 118, 56, 45}, // UTF-754	{247, 100, 76},        // UTF-155	{221, 115, 102, 115},  // UTF-EBCDIC56	{14, 254, 255},        // SCSU57	{251, 238, 40},        // BOCU-158	{132, 49, 149, 51},    // GB-1803059}6061var duplicates = CheckDuplicates{62	hashes: make(map[int64][][]byte),63}6465func checkForMatchSingle(currentByte byte, index int, endPoint int, matches []byte, fileJob *FileJob) bool {66	potentialMatch := true67	if currentByte == matches[0] {68		for j := 0; j < len(matches); j++ {69			if index+j >= endPoint || matches[j] != fileJob.Content[index+j] {70				potentialMatch = false71				break72			}73		}7475		if potentialMatch {76			return true77		}78	}7980	return false81}8283func isWhitespace(currentByte byte) bool {84	if currentByte != ' ' && currentByte != '\t' && currentByte != '\n' && currentByte != '\r' {85		return false86	}8788	return true89}9091func isIdentifierContinue(b byte) bool {92	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9') || b == '_'93}9495func hasNonWhitespaceBefore(content []byte, index int) bool {96	for i := index - 1; i >= 0; i-- {97		if !isWhitespace(content[i]) {98			return true99		}100	}101102	return false103}104105func nextNonWhitespaceIndex(content []byte, index int) int {106	for index < len(content) && isWhitespace(content[index]) {107		index++108	}109110	return index111}112113func hasPostfixExclude(content []byte, index, offsetJump int, excludes [][]byte) bool {114	token := content[index : index+offsetJump]115	for _, exclude := range excludes {116		if len(exclude) < offsetJump || !bytes.Equal(token, exclude[:offsetJump]) {117			continue118		}119120		remaining := exclude[offsetJump:]121		if len(remaining) == 0 {122			return true123		}124125		next := nextNonWhitespaceIndex(content, index+offsetJump)126		if next+len(remaining) > len(content) || !bytes.Equal(content[next:next+len(remaining)], remaining) {127			continue128		}129130		afterExclude := next + len(remaining)131		if isIdentifierContinue(remaining[len(remaining)-1]) {132			return afterExclude == len(content) || !isIdentifierContinue(content[afterExclude])133		}134135		return true136	}137138	return false139}140141func countComplexityPostfix(fileJob *FileJob, index, offsetJump int, postfixExcludes [][]byte) {142	if index == 0 {143		return144	}145146	content := fileJob.Content147	if isWhitespace(content[index-1]) && !hasNonWhitespaceBefore(content, index-1) {148		return149	}150151	if len(postfixExcludes) > 0 && hasPostfixExclude(content, index, offsetJump, postfixExcludes) {152		return153	}154155	fileJob.Complexity++156	fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] = fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] + 1157}158159// Check if this file is binary by checking for nul byte and if so bail out160// this is how GNU Grep, git and ripgrep check for binary files161func isBinary(index int, currentByte byte) bool {162	return index < 10000 && !DisableCheckBinary && currentByte == 0163}164165func shouldProcess(currentByte, processBytesMask byte) bool {166	return currentByte&processBytesMask == currentByte167}168169func stateToByteType(state int64) byte {170	switch state {171	case SCode:172		return ByteTypeCode173	case SString:174		return ByteTypeString175	case SComment, SCommentCode, SMulticomment, SMulticommentCode, SMulticommentBlank, SDocString:176		return ByteTypeComment177	default: // SBlank178		return ByteTypeBlank179	}180}181182func resetState(currentState int64) int64 {183	switch currentState {184	case SMulticomment, SMulticommentCode:185		currentState = SMulticomment186	case SString:187		currentState = SString188	default:189		currentState = SBlank190	}191192	return currentState193}194195func stringState(fileJob *FileJob, index int, endPoint int, endString []byte, currentState int64, ignoreEscape bool) (int, int64) {196	// It's not possible to enter this state without checking at least 1 byte so it is safe to check -1 here197	// without checking if it is out of bounds first198	for i := index; i < endPoint; i++ {199		index = i200201		if fileJob.ContentByteType != nil {202			fileJob.ContentByteType[i] = ByteTypeString203		}204205		// If we hit a newline, return because we want to count the stats but keep206		// the current state so we end up back in this loop when the outer207		// one calls again208		if fileJob.Content[i] == '\n' {209			return i, currentState210		}211212		is_escaped := false213		// if there is an escape symbol before us, investigate214		if fileJob.Content[i-1] == '\\' {215			num_escapes := 0216			for j := i - 1; j > 0; j-- {217				if fileJob.Content[j] != '\\' {218					break219				}220				num_escapes++221			}222223			// if number of escapes is even, all escapes are themselves escaped224			// otherwise the last escape does escape current string terminator225			if num_escapes%2 != 0 {226				is_escaped = true227			}228		}229230		// If we are in a literal string we want to ignore escapes OR we aren't checking for special ones231		if ignoreEscape || !is_escaped {232			if checkForMatchSingle(fileJob.Content[i], index, endPoint, endString, fileJob) {233				return i, SCode234			}235		}236	}237238	return index, currentState239}240241// This is a special state check pretty much only ever used by Python codebases242// but potentially it could be expanded to deal with other types243func docStringState(fileJob *FileJob, index int, endPoint int, endString []byte, currentState int64) (int, int64) {244	// It's not possible to enter this state without checking at least 1 byte so it is safe to check -1 here245	// without checking if it is out of bounds first246	for i := index; i < endPoint; i++ {247		index = i248249		if fileJob.ContentByteType != nil {250			fileJob.ContentByteType[i] = ByteTypeComment251		}252253		if fileJob.Content[i] == '\n' {254			return i, currentState255		}256257		if fileJob.Content[i-1] != '\\' {258			if checkForMatchSingle(fileJob.Content[i], index, endPoint, endString, fileJob) {259				// So we have hit end of docstring at this point in which case check if only whitespace characters till the next260				// newline and if so we change to a comment otherwise to code261				// need to start the loop after ending definition of docstring, therefore adding the length of the string to262				// the index263				for j := index + len(endString); j <= endPoint; j++ {264					if fileJob.Content[j] == '\n' {265						printDebug("Found newline so docstring is comment")266						return i, SComment267					}268269					if !isWhitespace(fileJob.Content[j]) {270						printDebugF("Found something not whitespace so is code: %s", string(fileJob.Content[j]))271						return i, SCode272					}273				}274275				return i, SCode276			}277		}278	}279280	return index, currentState281}282283func codeState(284	fileJob *FileJob,285	index int,286	endPoint int,287	currentState int64,288	endString []byte,289	endComments [][]byte,290	langFeatures LanguageFeature,291	digest *hash.Hash,292) (int, int64, []byte, [][]byte, bool) {293	// Hacky fix to https://github.com/boyter/scc/issues/181294	if endPoint > len(fileJob.Content) {295		endPoint--296	}297298	for i := index; i < endPoint; i++ {299		curByte := fileJob.Content[i]300		index = i301302		if fileJob.ContentByteType != nil {303			fileJob.ContentByteType[i] = ByteTypeCode304		}305306		if curByte == '\n' {307			return i, currentState, endString, endComments, false308		}309310		if isBinary(i, curByte) {311			fileJob.Binary = true312			return i, currentState, endString, endComments, false313		}314315		if shouldProcess(curByte, langFeatures.ProcessMask) {316			if Duplicates {317				// Technically this is wrong because we skip bytes, so this is not a true318				// hash of the file contents, but for duplicate files it shouldn't matter319				// as both will skip the same way320				digestible := []byte{fileJob.Content[index]}321				(*digest).Write(digestible)322			}323324			switch tokenType, offsetJump, endString := langFeatures.Tokens.Match(fileJob.Content[i:]); tokenType {325			case TString:326				// If we are in string state then check what sort of string so we know if docstring OR ignoreescape string327				i, ignoreEscape := verifyIgnoreEscape(langFeatures, fileJob, index)328329				// It is safe to -1 here as to enter the code state we need to have330				// transitioned from blank to here hence i should always be >= 1331				// This check is to ensure we aren't in a character declaration332				// TODO this should use language features333				if fileJob.Content[i-1] != '\\' {334					currentState = SString335				}336337				return i, currentState, endString, endComments, ignoreEscape338339			case TSlcomment:340				currentState = SCommentCode341				return i, currentState, endString, endComments, false342343			case TMlcomment:344				if langFeatures.Nested || len(endComments) == 0 {345					endComments = append(endComments, endString)346					currentState = SMulticommentCode347					i += offsetJump - 1348349					return i, currentState, endString, endComments, false350				}351352			case TComplexity:353				if index == 0 || isWhitespace(fileJob.Content[index-1]) {354					fileJob.Complexity++355					fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] = fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] + 1356				}357358			case TComplexityPostfix:359				countComplexityPostfix(fileJob, index, offsetJump, langFeatures.PostfixExcludes)360			}361		}362	}363364	return index, currentState, endString, endComments, false365}366367func commentState(fileJob *FileJob, index int, endPoint int, currentState int64, endComments [][]byte, endString []byte, langFeatures LanguageFeature) (int, int64, []byte, [][]byte) {368	for i := index; i < endPoint; i++ {369		curByte := fileJob.Content[i]370		index = i371372		if fileJob.ContentByteType != nil {373			fileJob.ContentByteType[i] = ByteTypeComment374		}375376		if curByte == '\n' {377			return i, currentState, endString, endComments378		}379380		if checkForMatchSingle(curByte, index, endPoint, endComments[len(endComments)-1], fileJob) {381			// set offset jump here382			offsetJump := len(endComments[len(endComments)-1])383			endComments = endComments[:len(endComments)-1]384385			if len(endComments) == 0 {386				// If we started as multiline code switch back to code so we count correctly387				// IE i := 1 /* for the lols */388				// TODO is that required? Might still be required to count correctly389				if currentState == SMulticommentCode {390					currentState = SCode // TODO pointless to change here, just set S_MULTICOMMENT_BLANK391				} else {392					currentState = SMulticommentBlank393				}394			}395396			i += offsetJump - 1397			return i, currentState, endString, endComments398		}399		// Check if we are entering another multiline comment400		// This should come below check for match single as it speeds up processing401		if langFeatures.Nested || len(endComments) == 0 {402			if ok, offsetJump, endString := langFeatures.MultiLineComments.Match(fileJob.Content[i:]); ok != 0 {403				endComments = append(endComments, endString)404				i += offsetJump - 1405406				return i, currentState, endString, endComments407			}408		}409	}410411	return index, currentState, endString, endComments412}413414func blankState(415	fileJob *FileJob,416	index int,417	currentState int64,418	endComments [][]byte,419	endString []byte,420	langFeatures LanguageFeature,421) (int, int64, []byte, [][]byte, bool) {422	switch tokenType, offsetJump, endString := langFeatures.Tokens.Match(fileJob.Content[index:]); tokenType {423	case TMlcomment:424		if langFeatures.Nested || len(endComments) == 0 {425			endComments = append(endComments, endString)426			currentState = SMulticomment427			index += offsetJump - 1428			if fileJob.ContentByteType != nil {429				fileJob.ContentByteType[index] = ByteTypeComment430			}431			return index, currentState, endString, endComments, false432		}433434	case TSlcomment:435		currentState = SComment436		if fileJob.ContentByteType != nil {437			fileJob.ContentByteType[index] = ByteTypeComment438		}439		return index, currentState, endString, endComments, false440441	case TString:442		index, ignoreEscape := verifyIgnoreEscape(langFeatures, fileJob, index)443444		for _, v := range langFeatures.Quotes {445			if v.End == string(endString) && v.DocString {446				currentState = SDocString447				if fileJob.ContentByteType != nil {448					fileJob.ContentByteType[index] = ByteTypeComment449				}450				return index, currentState, endString, endComments, ignoreEscape451			}452		}453		currentState = SString454		if fileJob.ContentByteType != nil {455			fileJob.ContentByteType[index] = ByteTypeString456		}457		return index, currentState, endString, endComments, ignoreEscape458459	case TComplexity:460		currentState = SCode461		if fileJob.ContentByteType != nil {462			fileJob.ContentByteType[index] = ByteTypeCode463		}464		if index == 0 || isWhitespace(fileJob.Content[index-1]) {465			fileJob.Complexity++466			fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] = fileJob.ComplexityLine[len(fileJob.ComplexityLine)-1] + 1467		}468469	case TComplexityPostfix:470		currentState = SCode471		if fileJob.ContentByteType != nil {472			fileJob.ContentByteType[index] = ByteTypeCode473		}474		countComplexityPostfix(fileJob, index, offsetJump, langFeatures.PostfixExcludes)475476	default:477		currentState = SCode478		if fileJob.ContentByteType != nil {479			fileJob.ContentByteType[index] = ByteTypeCode480		}481	}482483	return index, currentState, endString, endComments, false484}485486// Some languages such as C# have quoted strings like @"\" where no escape character is required487// this checks if there is one so we can cater for these cases488func verifyIgnoreEscape(langFeatures LanguageFeature, fileJob *FileJob, index int) (int, bool) {489	ignoreEscape := false490491	// loop over the string states and if we have the special flag match, and if so we need to ensure we can handle them492	for i := 0; i < len(langFeatures.Quotes); i++ {493		if langFeatures.Quotes[i].DocString || langFeatures.Quotes[i].IgnoreEscape {494			// If so we need to check if where we are falls into these conditions495			isMatch := true496			for j := 0; j < len(langFeatures.Quotes[i].Start); j++ {497				if len(fileJob.Content) <= index+j || fileJob.Content[index+j] != langFeatures.Quotes[i].Start[j] {498					isMatch = false499					break500				}501			}502503			// If we have a match then jump ahead enough so we don't pick it up again for cases like @"504			if isMatch {505				ignoreEscape = true506				index = index + len(langFeatures.Quotes[i].Start)507			}508		}509	}510511	return index, ignoreEscape512}513514// CountStats will process the fileJob515// If the file contains anything even just a newline its line count should be >= 1.516// If the file has a size of 0 its line count should be 0.517// Newlines belong to the line they started on so a file of \n means only 1 line518// This is the 'hot' path for the application and needs to be as fast as possible519func CountStats(fileJob *FileJob) {520	// For determining duplicates we need the below. The reason for creating521	// the byte array here is to avoid GC pressure. MD5 is in the standard library522	// and is fast enough to not warrant murmur3 hashing. No need to be523	// crypto secure here either so no need to eat the performance cost of a better524	// hash method525	if Duplicates {526		fileJob.Hash, _ = blake2b.New256(nil)527	}528529	// If the file has a length of 0 it is empty then we say it has no lines530	if fileJob.Bytes == 0 {531		fileJob.Lines = 0532		return533	}534535	LanguageFeaturesMutex.Lock()536	langFeatures := LanguageFeatures[fileJob.Language]537	LanguageFeaturesMutex.Unlock()538539	if langFeatures.Complexity == nil {540		langFeatures.Complexity = &Trie{}541	}542	if langFeatures.SingleLineComments == nil {543		langFeatures.SingleLineComments = &Trie{}544	}545	if langFeatures.MultiLineComments == nil {546		langFeatures.MultiLineComments = &Trie{}547	}548	if langFeatures.Strings == nil {549		langFeatures.Strings = &Trie{}550	}551	if langFeatures.Tokens == nil {552		langFeatures.Tokens = &Trie{}553	}554555	endPoint := int(fileJob.Bytes - 1)556	currentState := SBlank557	endComments := [][]byte{}558	endString := []byte{}559560	// TODO needs to be set via langFeatures.Quotes[0].IgnoreEscape for the matching feature561	ignoreEscape := false562	fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)563564	if fileJob.ClassifyContent {565		fileJob.ContentByteType = make([]byte, fileJob.Bytes)566	}567568	for index := checkBomSkip(fileJob); index < int(fileJob.Bytes); index++ {569		if fileJob.ContentByteType != nil {570			fileJob.ContentByteType[index] = stateToByteType(currentState)571		}572573		// Based on our current state determine if the state should change by checking574		// what the character is. The below is very CPU bound so need to be careful if575		// changing anything in here and profile/measure afterwards!576		// NB that the order of the if statements matters and has been set to what in benchmarks is most efficient577		if !isWhitespace(fileJob.Content[index]) {578579			switch currentState {580			case SCode:581				index, currentState, endString, endComments, ignoreEscape = codeState(582					fileJob,583					index,584					endPoint,585					currentState,586					endString,587					endComments,588					langFeatures,589					&fileJob.Hash,590				)591			case SString:592				index, currentState = stringState(fileJob, index, endPoint, endString, currentState, ignoreEscape)593			case SDocString:594				// For a docstring we can either move into blank in which case we count it as a docstring595				// or back into code in which case it should be counted as code596				index, currentState = docStringState(fileJob, index, endPoint, endString, currentState)597			case SMulticomment, SMulticommentCode:598				index, currentState, endString, endComments = commentState(599					fileJob,600					index,601					endPoint,602					currentState,603					endComments,604					endString,605					langFeatures,606				)607			case SBlank, SMulticommentBlank:608				// From blank we can move into comment, move into a multiline comment609				// or move into code but we can only do one.610				index, currentState, endString, endComments, ignoreEscape = blankState(611					fileJob,612					index,613					currentState,614					endComments,615					endString,616					langFeatures,617				)618			}619		}620621		// We shouldn't normally need this, but unclosed strings or comments622		// might leave the index past the end of the file when we reach this623		// point.624		if index >= len(fileJob.Content) {625			return626		}627628		// Only check the first 10000 characters for null bytes indicating a binary file629		// and if we find it then we return otherwise carry on and ignore binary markers630		if index < 10000 && fileJob.Binary {631			return632		}633634		// This means the end of processing the line so calculate the stats according to what state635		// we are currently in636		if fileJob.Content[index] == '\n' || index >= endPoint {637			fileJob.Lines++638			fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)639640			if NoLarge && fileJob.Lines >= LargeLineCount {641				// Save memory by unsetting the content as we no longer require it642				fileJob.Content = nil643				return644			}645646			switch currentState {647			case SCode, SString, SCommentCode, SMulticommentCode:648				fileJob.Code++649				currentState = resetState(currentState)650				if fileJob.Callback != nil {651					if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_CODE) {652						return653					}654				}655				if Trace {656					// Don't remove the outside if-statements, for performance657					printTraceF("%s line %d ended with state: %d: counted as code", fileJob.Location, fileJob.Lines, currentState)658				}659			case SComment, SMulticomment, SMulticommentBlank:660				fileJob.Comment++661				currentState = resetState(currentState)662				if fileJob.Callback != nil {663					if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_COMMENT) {664						return665					}666				}667				if Trace {668					// Same as above669					printTraceF("%s line %d ended with state: %d: counted as comment", fileJob.Location, fileJob.Lines, currentState)670				}671			case SBlank:672				fileJob.Blank++673				if fileJob.Callback != nil {674					if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_BLANK) {675						return676					}677				}678				if Trace {679					// Same as above680					printTraceF("%s line %d ended with state: %d: counted as blank", fileJob.Location, fileJob.Lines, currentState)681				}682			case SDocString:683				fileJob.Comment++684				if fileJob.Callback != nil {685					if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_COMMENT) {686						return687					}688				}689				if Trace {690					// Same as above691					printTraceF("%s line %d ended with state: %d: counted as comment", fileJob.Location, fileJob.Lines, currentState)692				}693			}694		}695	}696697	if UlocMode {698		uloc := map[string]struct{}{}699		for l := range strings.SplitSeq(strings.TrimRight(string(fileJob.Content), "\n"), "\n") {700			uloc[l] = struct{}{}701		}702		fileJob.Uloc = len(uloc)703	}704705	if MaxMean {706		for l := range strings.SplitSeq(strings.TrimRight(string(fileJob.Content), "\n"), "\n") {707			fileJob.LineLength = append(fileJob.LineLength, len(l))708		}709	}710711	isGenerated := false712713	if Generated {714		headLen := min(1000, len(fileJob.Content))715		head := bytes.ToLower(fileJob.Content[0:headLen])716		for _, marker := range GeneratedMarkers {717			if bytes.Contains(head, bytes.ToLower([]byte(marker))) {718				fileJob.Generated = true719				fileJob.Language = fileJob.Language + " (gen)"720				isGenerated = true721				printWarnF("%s identified as isGenerated with heading comment", fileJob.Filename)722				break723			}724		}725	}726727	// check if 0 as well to avoid divide by zero https://github.com/boyter/scc/issues/223728	if !isGenerated && Minified && fileJob.Lines != 0 {729		avgLineByteCount := len(fileJob.Content) / int(fileJob.Lines)730		minifiedGeneratedCheck(avgLineByteCount, fileJob)731	}732733	fileJob.ComplexityLine = fileJob.ComplexityLine[:fileJob.Lines]734}735736func minifiedGeneratedCheck(avgLineByteCount int, fileJob *FileJob) {737	if avgLineByteCount >= MinifiedGeneratedLineByteLength {738		fileJob.Minified = true739		fileJob.Language = fileJob.Language + " (min)"740		printWarnF("%s identified as minified/generated with average line byte length of %d >= %d", fileJob.Filename, avgLineByteCount, MinifiedGeneratedLineByteLength)741	} else {742		printDebugF("%s not identified as minified/generated with average line byte length of %d < %d", fileJob.Filename, avgLineByteCount, MinifiedGeneratedLineByteLength)743	}744}745746// Check if we have any Byte Order Marks (BOM) in front of the file747func checkBomSkip(fileJob *FileJob) int {748	// UTF-8 BOM which if detected we should skip the BOM as we can then count correctly749	// []byte is UTF-8 BOM taken from https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding750	if bytes.HasPrefix(fileJob.Content, []byte{239, 187, 191}) {751		printWarnF("UTF-8 BOM found for file %s skipping 3 bytes", fileJob.Filename)752		return 3753	}754755	// If we have one of the other BOM then we might not be able to count correctly so if verbose let the user know756	if Verbose {757		for _, v := range ByteOrderMarks {758			if bytes.HasPrefix(fileJob.Content, v) {759				printWarnF("BOM found for file %s indicating it is not ASCII/UTF-8 and may be counted incorrectly or ignored as a binary file", fileJob.Filename)760			}761		}762	}763764	return 0765}766767// Reads and processes files from input chan in parallel, and sends results to768// output chan769func (ctx processorContext) fileProcessorWorker(input chan *FileJob, output chan *FileJob) {770	var startTime int64771	var fileCount int64772	var gcEnabled int64773	var wg sync.WaitGroup774775	for i := 0; i < FileProcessJobWorkers; i++ {776		wg.Add(1)777		go func() {778			reader := NewFileReader()779780			for job := range input {781				atomic.CompareAndSwapInt64(&startTime, 0, makeTimestampMilli())782783				loc := job.Location784				if job.Symlocation != "" {785					loc = job.Symlocation786				}787788				fileStartTime := makeTimestampNano()789				content, err := reader.ReadFile(loc, int(job.Bytes))790				atomic.AddInt64(&fileCount, 1)791792				if atomic.LoadInt64(&gcEnabled) == 0 && atomic.LoadInt64(&fileCount) >= int64(GcFileCount) {793					debug.SetGCPercent(gcPercent)794					atomic.AddInt64(&gcEnabled, 1)795					printWarn("read file limit exceeded GC re-enabled")796				}797798				printTraceF("nanoseconds read into memory: %s: %d", job.Location, makeTimestampNano()-fileStartTime)799800				if err == nil {801					job.Content = content802					if ctx.processFile(job) {803						output <- job804					}805				} else {806					printWarnF("error reading: %s %s", job.Location, err)807				}808			}809810			wg.Done()811		}()812	}813814	go func() {815		wg.Wait()816		close(output)817818		printDebugF("milliseconds reading files into memory: %d", makeTimestampMilli()-startTime)819	}()820}821822// Process a single file823// File must have been read to job.Content already824func (ctx processorContext) processFile(job *FileJob) bool {825	fileStartTime := makeTimestampNano()826827	contents := job.Content828829	// Needs to always run to ensure the language is set830	job.Language = DetermineLanguage(job.Filename, job.Language, job.PossibleLanguages, job.Content)831832	remapped := false833	if len(ctx.remap.all) != 0 {834		ctx.hardRemapLanguage(job)835	}836837	// If the type is #! we should check to see if we can identify838	if job.Language == SheBang {839		if len(ctx.remap.unknown) != 0 {840			remapped = ctx.unknownRemapLanguage(job)841		}842843		// if we didn't remap we then want to see if it's a #! map844		if !remapped {845			cutoff := min(200, len(contents))846847			lang, err := DetectSheBang(string(contents[:cutoff]))848			if err != nil {849				printWarnF("unable to determine #! language for %s", job.Location)850				return false851			}852853			printWarnF("detected #! %s for %s", lang, job.Location)854			job.Language = lang855			LoadLanguageFeature(lang)856		}857	}858859	CountStats(job)860861	if Duplicates {862		duplicates.mux.Lock()863		jobHash := job.Hash.Sum(nil)864		if duplicates.Check(job.Bytes, jobHash) {865			printWarnF("skipping duplicate file: %s", job.Location)866			duplicates.mux.Unlock()867			return false868		}869870		duplicates.Add(job.Bytes, jobHash)871		duplicates.mux.Unlock()872	}873874	if IgnoreMinified && job.Minified {875		printWarnF("skipping minified file: %s", job.Location)876		return false877	}878879	if IgnoreGenerated && job.Generated {880		printWarnF("skipping generated file: %s", job.Location)881		return false882	}883884	if NoLarge && job.Lines >= LargeLineCount {885		printWarnF("skipping large file due to line length: %s", job.Location)886		return false887	}888889	printTraceF("nanoseconds process: %s: %d", job.Location, makeTimestampNano()-fileStartTime)890891	if job.Binary {892		printWarnF("skipping file identified as binary: %s", job.Location)893		return false894	}895896	// This needs to be at the end so we can ensure duplicate detection et.al run first897	// avoiding inflating the counts898	if UlocMode {899		ulocMutex.Lock()900901		for l := range strings.SplitSeq(strings.TrimRight(string(job.Content), "\n"), "\n") {902			ulocGlobalCount[l] = struct{}{}903904			_, ok := ulocLanguageCount[job.Language]905			if !ok {906				ulocLanguageCount[job.Language] = map[string]struct{}{}907			}908			ulocLanguageCount[job.Language][l] = struct{}{}909		}910		ulocMutex.Unlock()911	}912913	return true914}915916func (ctx processorContext) hardRemapLanguage(job *FileJob) bool {917	remapped := false918	cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look919920	for _, rule := range ctx.remap.all {921		if bytes.Contains(job.Content[:cutoff], rule.pattern) {922			job.Language = rule.language923			remapped = true924			printWarnF("hard remapping: %s to %s", job.Location, job.Language)925		}926	}927928	return remapped929}930931func (ctx processorContext) unknownRemapLanguage(job *FileJob) bool {932	remapped := false933	cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look934935	for _, rule := range ctx.remap.unknown {936		if bytes.Contains(job.Content[:cutoff], rule.pattern) {937			job.Language = rule.language938			remapped = true939			printWarnF("unknown remapping: %s to %s", job.Location, job.Language)940		}941	}942943	return remapped944}
Code quality findings 12

Blank identifier discarding results; verify intentional ignoring of return values
L526
warning correctness blank-identifier-discard
fileJob.Hash, _ = blake2b.New256(nil)
Ensure errors are handled or logged
L848
warning correctness unhandled-error
if err != nil {
Ensure paired with Unlock defer to prevent deadlocks
L899
warning correctness lock-without-unlock
ulocMutex.Lock()
Deeply nested control structures reduce readability; consider extracting to functions or using early returns
L253
info maintainability deep-nesting
if fileJob.Content[i] == '\n' {
Multiple appends without pre-allocation; use make() with capacity when size is known
L403
info performance append-without-prealloc
endComments = append(endComments, endString)
Deeply nested control structures reduce readability; consider extracting to functions or using early returns
L496
info maintainability deep-nesting
for j := 0; j < len(langFeatures.Quotes[i].Start); j++ {
Deeply nested control structures reduce readability; consider extracting to functions or using early returns
L497
info maintainability deep-nesting
if len(fileJob.Content) <= index+j || fileJob.Content[index+j] != langFeatures.Quotes[i].Start[j] {
Multiple appends without pre-allocation; use make() with capacity when size is known
L562
info performance append-without-prealloc
fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)
Multiple appends without pre-allocation; use make() with capacity when size is known
L638
info performance append-without-prealloc
fileJob.ComplexityLine = append(fileJob.ComplexityLine, 0)
Multiple appends without pre-allocation; use make() with capacity when size is known
L707
info performance append-without-prealloc
fileJob.LineLength = append(fileJob.LineLength, len(l))
String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop
L717
info correctness string-to-byte-in-loop
if bytes.Contains(head, bytes.ToLower([]byte(marker))) {
Adjusting garbage collection settings dynamically can be a sign of deeper problems in the codebase, suggesting a need for better coding practices
L793
info correctness gc-tuning
debug.SetGCPercent(gcPercent)
Code quality findings 12

Get this view in your editor