processor/processor.go · boyter/scc

1// SPDX-License-Identifier: MIT23package processor45import (6	"fmt"7	"io"8	"os"9	"path/filepath"10	"regexp"11	"runtime"12	"runtime/debug"13	"slices"14	"strconv"15	"strings"16	"sync"1718	"github.com/boyter/gocodewalker"19)2021// Version indicates the version of the application22var Version = "3.8.0 (beta)"2324// Flags set via the CLI which control how the output is displayed2526// Files indicates if there should be file output or not when formatting27var Files = false2829// Languages indicates if the command line should print out the supported languages30var Languages = false3132// Verbose enables verbose logging output33var Verbose = false3435// Debug enables debug logging output36var Debug = false3738// Trace enables trace logging output which is extremely verbose39var Trace = false4041// Duplicates enables duplicate file detection42var Duplicates = false4344// MinifiedGenerated enables minified/generated file detection45var MinifiedGenerated = false4647// IgnoreMinifiedGenerate printing counts for minified/generated files48var IgnoreMinifiedGenerate = false4950// MinifiedGeneratedLineByteLength number of bytes per average line to determine file is minified/generated51var MinifiedGeneratedLineByteLength = 2555253// Minified enables minified file detection54var Minified = false5556// IgnoreMinified ignore printing counts for minified files57var IgnoreMinified = false5859// Generated enables generated file detection60var Generated = false6162// GeneratedMarkers defines head markers for generated file detection63var GeneratedMarkers []string6465// IgnoreGenerated ignore printing counts for generated files66var IgnoreGenerated = false6768// Complexity toggles complexity calculation69var Complexity = false7071// More enables wider output with more information in formatter72var More = false7374// Cocomo toggles the COCOMO calculation75var Cocomo = false7677// SLOCCountFormat prints a more SLOCCount like COCOMO calculation78var SLOCCountFormat = false7980// 
CocomoProjectType allows the flipping between project types which impacts the calculation81var CocomoProjectType = "organic"8283// Size toggles the Size calculation84var Size = false8586// Draw horizontal borders between sections.87var HBorder = false8889// SizeUnit determines what size calculation is used for megabytes90var SizeUnit = "si"9192// Ci indicates if running inside a CI so to disable box drawing characters93var Ci = false9495// GitIgnore disables .gitignore checks96var GitIgnore = false9798// GitModuleIgnore disables .gitmodules checks99var GitModuleIgnore = false100101// Ignore disables ignore file checks102var Ignore = false103104// SccIgnore disables sccignore file checks105var SccIgnore = false106107// CountIgnore should we count ignore files?108var CountIgnore = false109110// DisableCheckBinary toggles checking for binary files using NUL bytes111var DisableCheckBinary = false112113// UlocMode toggles checking for binary files using NUL bytes114var UlocMode = false115116// Percent toggles checking for binary files using NUL bytes117var Percent = false118119// MaxMean sets the calculation of the max and mean line length120var MaxMean = false121122// Dryness toggles checking for binary files using NUL bytes123var Dryness = false124125// SortBy sets which column output in formatter should be sorted by126var SortBy = ""127128// Exclude is a regular expression which is used to exclude files from being processed129var Exclude = []string{}130131// CountAs is a rule for mapping known or new extensions to other rules132var CountAs = ""133134// Format sets the output format of the formatter135var Format = ""136137// FormatMulti is a rule for defining multiple output formats138var FormatMulti = ""139140// SQLProject is used to store the name for the SQL insert formats but is optional141var SQLProject = ""142143// RemapUnknown allows remapping of unknown files with a string to search the content for144var RemapUnknown = ""145146// RemapAll allows remapping of 
all files with a string to search the content for147var RemapAll = ""148149type remapRule struct {150	pattern  []byte151	language string152}153154type remapConfig struct {155	all     []remapRule156	unknown []remapRule157}158159type processorContext struct {160	remap remapConfig161}162163func parseRemapRules(value string) []remapRule {164	rules := []remapRule{}165166	for s := range strings.SplitSeq(value, ",") {167		t := strings.Split(s, ":")168		if len(t) == 2 {169			rules = append(rules, remapRule{170				pattern:  []byte(t[0]),171				language: t[1],172			})173		}174	}175176	return rules177}178179func newRemapConfig(remapAll string, remapUnknown string) remapConfig {180	return remapConfig{181		all:     parseRemapRules(remapAll),182		unknown: parseRemapRules(remapUnknown),183	}184}185186// MatchEngine selects how a CountRule pattern is interpreted. Glob is the187// default; regex is opt-in via the re: prefix.188type MatchEngine int189190const (191	// MatchGlob is the default. The pattern is a glob ('*' and '?') translated192	// to an anchored regex and matched as a full match against the path.193	MatchGlob MatchEngine = iota194	// MatchRegex treats the pattern as a raw (unanchored) RE2 regex. Opt in195	// with the re: prefix.196	MatchRegex197)198199// CountRule is the typed, library-facing form of a --count-as-pattern rule.200// It matches files by their path and relabels them to a new named category201// whose counting rules are cloned from an existing base language.202type CountRule struct {203	Engine       MatchEngine // MatchGlob (the default) or MatchRegex204	Pattern      string      // glob or regex source205	Name         string      // new category display name206	BaseLanguage string      // existing language whose counting rules are cloned207}208209// CountRules is the typed input set either directly by library users or by the210// CLI after parsing CountAsPattern. 
Setup happens in setupCountRules.211var CountRules []CountRule212213// CountAsPattern holds the raw repeatable --count-as-pattern flag values. Each214// is parsed into a CountRule at setup. Library users may set CountRules directly.215var CountAsPattern []string216217// compiledCountRule is the runtime form scanned by newFileJob218type compiledCountRule struct {219	re   *regexp.Regexp220	name string221}222223var compiledCountRules []compiledCountRule224225// CurrencySymbol allows setting the currency symbol for cocomo project cost estimation226var CurrencySymbol = ""227228// FileOutput sets the file that output should be written to229var FileOutput = ""230231// PathDenyList sets the paths that should be skipped232var PathDenyList = []string{}233234// FileListQueueSize is the queue of files found and ready to be read into memory235var FileListQueueSize = runtime.NumCPU()236237// FileProcessJobWorkers is the number of workers that process the file collecting stats238var FileProcessJobWorkers = runtime.NumCPU() * 4239240// FileSummaryJobQueueSize is the queue used to hold processed file statistics before formatting241var FileSummaryJobQueueSize = runtime.NumCPU()242243// DirectoryWalkerJobWorkers is the number of workers which will walk the directory tree244var DirectoryWalkerJobWorkers = 8245246// AllowListExtensions is a list of extensions which are allowed to be processed247var AllowListExtensions = []string{}248249// ExcludeListExtensions is a list of extensions which should be ignored250var ExcludeListExtensions = []string{}251252// ExcludeFilename is a list of filenames which should be ignored253var ExcludeFilename = []string{}254255// AverageWage is the average wage in dollars used for the COCOMO cost estimate256var AverageWage int64 = 56286257258// Overhead is the overhead multiplier for corporate overhead (facilities, equipment, accounting, etc.)259var Overhead float64 = 2.4260261// EAF is the effort adjustment factor derived from the cost drivers, i.e. 
1.0 if rated nominal262var EAF float64 = 1.0263264// Locomo toggles the LOCOMO (LLM Output COst MOdel) calculation265var Locomo = false266267// CostComparison enables both COCOMO and LOCOMO output for side-by-side comparison268var CostComparison = false269270// LocomoPresetName is the LLM model preset for pricing and throughput defaults271var LocomoPresetName = "medium"272273// LocomoInputPrice is the cost per 1M input tokens (overrides preset)274var LocomoInputPrice float64275var LocomoInputPriceSet = false276277// LocomoOutputPrice is the cost per 1M output tokens (overrides preset)278var LocomoOutputPrice float64279var LocomoOutputPriceSet = false280281// LocomoTPS is the output tokens per second (overrides preset)282var LocomoTPS float64283var LocomoTPSSet = false284285// LocomoReviewMinutesPerLine is the human review time per line of code in minutes286var LocomoReviewMinutesPerLine float64 = 0.01287288// LocomoConfig is the power-user config string "tokensPerLine,baseInputPerLine,complexityWeight,iterations,iterationWeight"289var LocomoConfig = ""290291// LocomoTokensPerLine is the average number of output tokens per line of code292var LocomoTokensPerLine float64 = 10293294// LocomoBaseInputPerLine is the base number of input tokens per output line295var LocomoBaseInputPerLine float64 = 20296297// LocomoComplexityWeight is the scaling weight applied to sqrt(complexity density) for input tokens298var LocomoComplexityWeight float64 = 5299300// LocomoIterations is the base number of iteration/retry attempts301var LocomoIterations float64 = 1.5302303// LocomoIterationWeight is the scaling weight for complexity-driven retries304var LocomoIterationWeight float64 = 2305306// LocomoCyclesOverride is the user-supplied iteration factor override (--locomo-cycles)307var LocomoCyclesOverride float64308309// LocomoCyclesSet indicates whether --locomo-cycles was explicitly set310var LocomoCyclesSet = false311312// GcFileCount is the number of files to process before turning 
the GC back on313var GcFileCount = 10000314var gcPercent = -1315var isLazy = false316317// NoLarge if set true will ignore files over a certain number of lines or bytes318var NoLarge = false319320// IncludeSymLinks if set true will count symlink files321var IncludeSymLinks = false322323// LargeLineCount number of lines before being counted as a large file based on https://github.com/pinpt/ripsrc/blob/master/ripsrc/fileinfo/fileinfo.go#L44324var LargeLineCount int64 = 40000325326// LargeByteCount number of bytes before being counted as a large file based on https://github.com/pinpt/ripsrc/blob/master/ripsrc/fileinfo/fileinfo.go#L44327var LargeByteCount int64 = 1000000328329// Hotspots toggles the hotspots git-history report330var Hotspots = false331332// ByAuthor toggles the author-rollup git-history report333var ByAuthor = false334335// Timeline selects an over-time view. With ByAuthor, runs the author336// timeline report (plan 04); alone, runs the languages-over-time report337// (plan 05). With Hotspots set, the combination errors out.338var Timeline = false339340// HistoryBuckets is the time-bucket resolution for the timeline reports.341// Wired to --buckets in main.go; default 60.342var HistoryBuckets = 60343344// FoldAuthors enables the name+domain identity folding fallback applied345// after the mailmap. 
Toggled off via --no-fold-authors.346var FoldAuthors = true347348// DirFilePaths is not set via flags but by arguments following the flags for file or directory to process349var DirFilePaths = []string{}350351// ExtensionToLanguage is loaded from the JSON that is in constants.go352var ExtensionToLanguage = map[string][]string{}353354// ShebangLookup loaded from the JSON in constants.go contains shebang lookups355var ShebangLookup = map[string][]string{}356357// FilenameToLanguage similar to ExtensionToLanguage loaded from the JSON in constants.go358var FilenameToLanguage = map[string]string{}359360// LanguageFeatures contains the processed languages from processLanguageFeature361var LanguageFeatures = map[string]LanguageFeature{}362363// LanguageFeaturesMutex is the shared mutex used to control getting and setting of language features364// used rather than sync.Map because it turned out to be marginally faster365var LanguageFeaturesMutex = sync.Mutex{}366367// Start time in milli seconds in case we want the total time368var startTimeMilli = makeTimestampMilli()369370// ConfigureGc needs to be set outside of ProcessConstants because it should only be enabled in command line371// mode https://github.com/boyter/scc/issues/32372func ConfigureGc() {373	gcPercent = debug.SetGCPercent(gcPercent)374}375376// ConfigureLazy is a simple setter used to turn on lazy loading used only by command line377func ConfigureLazy(lazy bool) {378	isLazy = lazy379}380381// ProcessConstants is responsible for setting up the language features based on the JSON file that is stored in constants382// Needs to be called at least once in order for anything to actually happen383func ProcessConstants() {384	startTime := makeTimestampNano()385	for name, value := range languageDatabase {386		for _, ext := range value.Extensions {387			ExtensionToLanguage[ext] = append(ExtensionToLanguage[ext], name)388		}389390		for _, fname := range value.FileNames {391			FilenameToLanguage[fname] = name392		}393394	
	if len(value.SheBangs) != 0 {395			ShebangLookup[name] = value.SheBangs396		}397	}398399	// If we have anything in CountAs set it up now400	if len(CountAs) != 0 {401		setupCountAs()402	}403404	printTraceF("nanoseconds build extension to language: %d", makeTimestampNano()-startTime)405406	// Set up any path pattern count rules, minting new categories backed by a407	// base language. The function clones the base language and builds its408	// features so counting works in both lazy and non-lazy modes.409	if len(CountAsPattern) != 0 || len(CountRules) != 0 {410		setupCountRules()411	}412413	// Configure COCOMO setting414	_, ok := projectType[strings.ToLower(CocomoProjectType)]415	if !ok {416		// let's see if we can turn it into a custom one417		spl := strings.Split(CocomoProjectType, ",")418		val := []float64{}419		if len(spl) == 5 {420			// let's try to convert to float if we can421			for i := 1; i < 5; i++ {422				f, err := strconv.ParseFloat(spl[i], 64)423				if err == nil {424					val = append(val, f)425				}426			}427		}428429		if len(val) == 4 {430			projectType[CocomoProjectType] = val431		} else {432			// if nothing matches fall back to organic433			CocomoProjectType = "organic"434		}435	}436437	// If lazy is set then we want to load in the features as we find them not in one go438	// however otherwise being used as a library so just load them all in439	if !isLazy {440		startTime = makeTimestampMilli()441		for name, value := range languageDatabase {442			processLanguageFeature(name, value)443		}444445		printTraceF("milliseconds build language features: %d", makeTimestampMilli()-startTime)446	} else {447		printTrace("configured to lazy load language features")448	}449450	// Fix for https://github.com/boyter/scc/issues/250451	fixedPath := make([]string, 0, len(PathDenyList))452	for _, path := range PathDenyList {453		fixedPath = append(fixedPath, strings.TrimRight(path, "/"))454	}455	PathDenyList = fixedPath456}457458// Configure and setup any count-as params 
the use has supplied459func setupCountAs() {460	for s := range strings.SplitSeq(CountAs, ",") {461		t := strings.Split(s, ":")462		if len(t) != 2 {463			printError(fmt.Sprintf("ignoring malformed count-as rule %q: expected format <from>:<to>", s))464			continue465		}466467		// There are two cases here.468		// first is they provide the name e.g. "Cargo Lock"469		// second is that the user supplies the extension EG wsdl470		// we should support BOTH cases471		// always remember we only need to validate t[1] as that's the one472		// that tells us where we are trying to map473		target, ok := resolveBaseLanguage(t[1])474		if ok {475			ExtensionToLanguage[strings.ToLower(t[0])] = []string{target}476			printDebugF("set to count extension: %s as language %s", t[0], target)477			continue478		}479480		// The target t[1] matched neither a known language name nor a known481		// extension, so no mapping was registered. Warn rather than silently482		// ignoring the rule, since count-as cannot mint new categories yet.483		printError(fmt.Sprintf("ignoring count-as rule %q: target %q is not a known language or extension", s, t[1]))484	}485}486487// resolveBaseLanguage resolves a user supplied target to a canonical language488// name. It first tries to match a language name (most reliable as names are489// unique) and falls back to matching a known extension. 
Returns the canonical490// language name and whether it was resolved.491func resolveBaseLanguage(target string) (string, bool) {492	// Match by language name which is the most reliable as the name is unique493	for name := range languageDatabase {494		if strings.EqualFold(name, target) {495			return name, true496		}497	}498499	// Fall back to extension match, note this is less reliable as some500	// languages share extensions so we take the first registered language501	langs, ok := ExtensionToLanguage[strings.ToLower(target)]502	if ok && len(langs) != 0 {503		return langs[0], true504	}505506	return "", false507}508509// parseCountAsPattern parses a single --count-as-pattern rule of the form510// [engine:]pattern:name:baselang into a CountRule.511//512// The engine prefix is optional and the pattern is treated as a GLOB BY513// DEFAULT; prefix with re: to opt into a regex (or glob: to be explicit). We514// keep glob and regex as distinct modes rather than inferring, because the same515// string is valid in both engines with different meaning (e.g. 
"foo.rb" matches516// only foo.rb as a glob but also fooXrb as a regex), so guessing would silently517// match the wrong files.518//519// Because regex patterns and paths legitimately contain ':', name and baselang520// are peeled from the right and the pattern is whatever remains in between.521func parseCountAsPattern(s string) (CountRule, error) {522	engine := MatchGlob523	rest := s524525	switch {526	case strings.HasPrefix(rest, "re:"):527		engine = MatchRegex528		rest = rest[len("re:"):]529	case strings.HasPrefix(rest, "glob:"):530		engine = MatchGlob531		rest = rest[len("glob:"):]532	}533534	// baselang = after the last ':', name = between the 2nd-last and last ':'535	lastColon := strings.LastIndex(rest, ":")536	if lastColon == -1 {537		return CountRule{}, fmt.Errorf("expected format [engine:]pattern:name:baselang")538	}539	baseLanguage := rest[lastColon+1:]540541	nameColon := strings.LastIndex(rest[:lastColon], ":")542	if nameColon == -1 {543		return CountRule{}, fmt.Errorf("expected format [engine:]pattern:name:baselang")544	}545	name := rest[nameColon+1 : lastColon]546	pattern := rest[:nameColon]547548	if pattern == "" || name == "" || baseLanguage == "" {549		return CountRule{}, fmt.Errorf("pattern, name and baselang must all be non-empty")550	}551552	return CountRule{Engine: engine, Pattern: pattern, Name: name, BaseLanguage: baseLanguage}, nil553}554555// globToRegex converts a simple glob into an anchored regex. Glob is the556// default --count-as-pattern engine. Only '*' (any run of characters) and '?'557// (single character) are special, everything else is matched literally. 
The558// result is anchored as a full match.559func globToRegex(glob string) string {560	var b strings.Builder561	b.WriteByte('^')562	for _, r := range glob {563		switch r {564		case '*':565			b.WriteString(".*")566		case '?':567			b.WriteByte('.')568		default:569			b.WriteString(regexp.QuoteMeta(string(r)))570		}571	}572	b.WriteByte('$')573	return b.String()574}575576// setupCountRules parses CountAsPattern into CountRules, compiles each rule and577// registers a cloned language under its new name so counting works. Invalid578// rules are reported to stderr and skipped, consistent with --count-as.579func setupCountRules() {580	for _, s := range CountAsPattern {581		rule, err := parseCountAsPattern(s)582		if err != nil {583			printError(fmt.Sprintf("ignoring malformed count-as-pattern rule %q: %s", s, err))584			continue585		}586		CountRules = append(CountRules, rule)587	}588589	for _, rule := range CountRules {590		base, ok := resolveBaseLanguage(rule.BaseLanguage)591		if !ok {592			printError(fmt.Sprintf("ignoring count-as-pattern rule for %q: base language %q is not a known language or extension", rule.Name, rule.BaseLanguage))593			continue594		}595596		source := rule.Pattern597		if rule.Engine == MatchGlob {598			source = globToRegex(rule.Pattern)599		}600601		re, err := regexp.Compile(source)602		if err != nil {603			printError(fmt.Sprintf("ignoring count-as-pattern rule for %q: invalid pattern %q: %s", rule.Name, rule.Pattern, err))604			continue605		}606607		// Clone the base language under the new name so it has counting rules,608		// clearing the matchers so the minted category never participates in609		// normal extension/filename/shebang detection.610		cloned := languageDatabase[base]611		cloned.Extensions = nil612		cloned.FileNames = nil613		cloned.SheBangs = nil614		languageDatabase[rule.Name] = cloned615616		// Populate features now in non-lazy mode, otherwise LoadLanguageFeature617		// will build them on first use since the name is in 
languageDatabase.618		if !isLazy {619			processLanguageFeature(rule.Name, cloned)620		}621622		compiledCountRules = append(compiledCountRules, compiledCountRule{re: re, name: rule.Name})623		printDebugF("set to count path matching %q as new language %s based on %s", rule.Pattern, rule.Name, base)624	}625}626627// LoadLanguageFeature will load a single feature as requested given the name628func LoadLanguageFeature(loadName string) {629	if !isLazy {630		return631	}632633	// Check if already loaded and if so return because we don't need to do it again634	LanguageFeaturesMutex.Lock()635	_, ok := LanguageFeatures[loadName]636	LanguageFeaturesMutex.Unlock()637	if ok {638		return639	}640641	var name string642	var value Language643644	for name, value = range languageDatabase {645		if name == loadName {646			break647		}648	}649650	startTime := makeTimestampNano()651	processLanguageFeature(loadName, value)652	printTraceF("nanoseconds to build language %s features: %d", loadName, makeTimestampNano()-startTime)653}654655func processLanguageFeature(name string, value Language) {656	complexityTrie := &Trie{}657	slCommentTrie := &Trie{}658	mlCommentTrie := &Trie{}659	stringTrie := &Trie{}660	tokenTrie := &Trie{}661	keywordBytes := make([][]byte, 0, len(value.Keywords))662	postfixExcludes := make([][]byte, 0, len(value.ComplexityChecksPostfixExcludes))663664	complexityMask := byte(0)665	singleLineCommentMask := byte(0)666	multiLineCommentMask := byte(0)667	stringMask := byte(0)668	processMask := byte(0)669670	for _, v := range value.ComplexityChecks {671		complexityMask |= v[0]672		complexityTrie.Insert(TComplexity, []byte(v))673		if !Complexity {674			tokenTrie.Insert(TComplexity, []byte(v))675		}676	}677	if !Complexity {678		processMask |= complexityMask679	}680681	for _, v := range value.ComplexityChecksPostfix {682		if !Complexity {683			tokenTrie.Insert(TComplexityPostfix, []byte(v))684			processMask |= v[0]685		}686	}687688	for _, v := range 
value.ComplexityChecksPostfixExcludes {689		postfixExcludes = append(postfixExcludes, []byte(v))690	}691692	for _, v := range value.LineComment {693		singleLineCommentMask |= v[0]694		slCommentTrie.Insert(TSlcomment, []byte(v))695		tokenTrie.Insert(TSlcomment, []byte(v))696	}697	processMask |= singleLineCommentMask698699	for _, v := range value.MultiLine {700		multiLineCommentMask |= v[0][0]701		mlCommentTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))702		tokenTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))703	}704	processMask |= multiLineCommentMask705706	for _, v := range value.Quotes {707		stringMask |= v.Start[0]708		stringTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))709		tokenTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))710	}711	processMask |= stringMask712713	for _, v := range value.Keywords {714		keywordBytes = append(keywordBytes, []byte(v))715	}716717	LanguageFeaturesMutex.Lock()718	LanguageFeatures[name] = LanguageFeature{719		Complexity:            complexityTrie,720		MultiLineComments:     mlCommentTrie,721		MultiLine:             value.MultiLine,722		SingleLineComments:    slCommentTrie,723		LineComment:           value.LineComment,724		Strings:               stringTrie,725		Tokens:                tokenTrie,726		Nested:                value.NestedMultiLine,727		PostfixExcludes:       postfixExcludes,728		ComplexityCheckMask:   complexityMask,729		MultiLineCommentMask:  multiLineCommentMask,730		SingleLineCommentMask: singleLineCommentMask,731		StringCheckMask:       stringMask,732		ProcessMask:           processMask,733		Keywords:              value.Keywords,734		KeywordBytes:          keywordBytes,735		Quotes:                value.Quotes,736	}737	LanguageFeaturesMutex.Unlock()738}739740func processFlags() {741	// If wide/more mode is enabled we want the complexity calculation742	// to happen regardless as that is the only purpose of the flag743	if More && Complexity {744		Complexity = false745	}746747	// If 
ignore minified/generated is on ensure we turn on the code to calculate that748	if IgnoreMinifiedGenerate {749		MinifiedGenerated = true750		IgnoreMinified = true751		IgnoreGenerated = true752	}753754	if MinifiedGenerated {755		Minified = true756		Generated = true757	}758759	if IgnoreMinified {760		Minified = true761	}762763	if IgnoreGenerated {764		Generated = true765	}766767	if Dryness {768		UlocMode = true769	}770771	printDebugF("Path Deny List: %v", PathDenyList)772	printDebugF("Sort By: %s", SortBy)773	printDebugF("White List: %v", AllowListExtensions)774	printDebugF("Files Output: %t", Files)775	printDebugF("Verbose: %t", Verbose)776	printDebugF("Duplicates Detection: %t", Duplicates)777	printDebugF("Complexity Calculation: %t", !Complexity)778	printDebugF("Wide: %t", More)779	// If cost-comparison is enabled, turn on both COCOMO and LOCOMO780	if CostComparison {781		Cocomo = false782		Locomo = true783	}784785	// LOCOMO needs complexity data to produce accurate estimates.786	// If complexity was disabled via --no-complexity, force it back on.787	if Locomo && Complexity {788		Complexity = false789	}790791	printDebugF("Average Wage: %d", AverageWage)792	printDebugF("Cocomo: %t", !Cocomo)793	printDebugF("Locomo: %t", Locomo)794	printDebugF("Minified/Generated Detection: %t/%t", Minified, Generated)795	printDebugF("Ignore Minified/Generated: %t/%t", IgnoreMinified, IgnoreGenerated)796	printDebugF("IncludeSymLinks: %t", IncludeSymLinks)797	printDebugF("Uloc: %t", UlocMode)798	printDebugF("Dryness: %t", Dryness)799}800801// LanguageDatabase provides access to the internal language database802// useful for consuming applications wanting to consume and use803func LanguageDatabase() map[string]Language {804	return languageDatabase805}806807func PrintLanguages(dst io.Writer) {808	names := make([]string, 0, len(languageDatabase))809	for key := range languageDatabase {810		names = append(names, key)811	}812813	slices.SortFunc(names, func(a, b string) int {814		return 
strings.Compare(strings.ToLower(a), strings.ToLower(b))815	})816817	for _, name := range names {818		_, _ = fmt.Fprintf(dst, "%s (%s)\n", name, strings.Join(append(languageDatabase[name].Extensions, languageDatabase[name].FileNames...), ","))819	}820}821822// global variables to deal with ULOC calculations823var ulocMutex = sync.Mutex{}824var ulocGlobalCount = map[string]struct{}{}825var ulocLanguageCount = map[string]map[string]struct{}{}826827// Process is the main entry point of the command line it sets everything up and starts running828func Process() {829	if Languages {830		PrintLanguages(os.Stdout)831		return832	}833834	ProcessConstants()835	processFlags()836	cleanVisitedPaths()837838	// Clean up any invalid arguments before setting everything up839	if len(DirFilePaths) == 0 {840		DirFilePaths = append(DirFilePaths, ".")841	}842843	// --report mode short-circuits the normal format dispatch and writes a844	// self-contained HTML report. Mutually exclusive with --format / -f: if845	// the user passed both, warn on stderr and let --report win.846	if ReportOut != "" {847		if Format != "" && Format != "tabular" {848			fmt.Fprintf(os.Stderr, "warning: --report overrides --format=%s\n", Format)849		}850		parseReportSkip(ReportSkip)851		if len(DirFilePaths) > 1 {852			fmt.Fprintf(os.Stderr, "warning: --report only analyses the first positional path (%s); other paths ignored\n", DirFilePaths[0])853		}854		if err := runReport(DirFilePaths); err != nil {855			fmt.Println(err)856			os.Exit(1)857		}858		return859	}860861	if Hotspots && (ByAuthor || Timeline) {862		fmt.Println("--hotspots is mutually exclusive with --by-author / --timeline; pick one report")863		os.Exit(1)864	}865866	if Hotspots || ByAuthor || Timeline {867		if err := validateHistoryFlags(os.Stderr); err != nil {868			fmt.Println(err)869			os.Exit(1)870		}871	}872873	if Hotspots {874		if err := runHotspotsReport(DirFilePaths[0]); err != nil {875			fmt.Println(err)876			os.Exit(1)877		}878		return879	
}880881	if ByAuthor && Timeline {882		if err := runAuthorTimelineReport(DirFilePaths[0]); err != nil {883			fmt.Println(err)884			os.Exit(1)885		}886		return887	}888889	if ByAuthor {890		if err := runAuthorsReport(DirFilePaths[0]); err != nil {891			fmt.Println(err)892			os.Exit(1)893		}894		return895	}896897	if Timeline {898		if err := runLanguagesTimelineReport(DirFilePaths[0]); err != nil {899			fmt.Println(err)900			os.Exit(1)901		}902		return903	}904905	filePaths := []string{}906	dirPaths := []string{}907908	// Check if the paths or files added exist and exit if not909	for _, f := range DirFilePaths {910		fpath := filepath.Clean(f)911912		s, err := os.Stat(fpath)913		if err != nil {914			fmt.Println("file or directory could not be read: " + fpath)915			os.Exit(1)916		}917918		if s.IsDir() {919			dirPaths = append(dirPaths, fpath)920		} else {921			filePaths = append(filePaths, fpath)922		}923	}924925	SortBy = strings.ToLower(SortBy)926	ctx := processorContext{remap: newRemapConfig(RemapAll, RemapUnknown)}927928	printDebugF("NumCPU: %d", runtime.NumCPU())929	printDebugF("SortBy: %s", SortBy)930	printDebugF("PathDenyList: %v", PathDenyList)931932	potentialFilesQueue := make(chan *gocodewalker.File, FileListQueueSize) // files that pass the .gitignore checks933	fileListQueue := make(chan *FileJob, FileListQueueSize)                 // Files ready to be read from disk934	fileSummaryJobQueue := make(chan *FileJob, FileSummaryJobQueueSize)     // Files ready to be summarised935936	fileWalker := gocodewalker.NewParallelFileWalker(dirPaths, potentialFilesQueue)937	fileWalker.SetErrorHandler(func(e error) bool {938		printError(e.Error())939		return true940	})941	fileWalker.IgnoreGitIgnore = GitIgnore942	fileWalker.IgnoreIgnoreFile = Ignore943	fileWalker.IgnoreGitModules = GitModuleIgnore944	fileWalker.IncludeHidden = true945	fileWalker.ExcludeDirectory = PathDenyList946	fileWalker.SetConcurrency(DirectoryWalkerJobWorkers)947948	if !SccIgnore {949		
fileWalker.CustomIgnore = []string{".sccignore"}950	}951952	var excludePathRegexes []*regexp.Regexp953	for _, exclude := range Exclude {954		regexpResult, err := regexp.Compile(exclude)955		if err == nil {956			fileWalker.ExcludeFilenameRegex = append(fileWalker.ExcludeFilenameRegex, regexpResult)957			fileWalker.ExcludeDirectoryRegex = append(fileWalker.ExcludeDirectoryRegex, regexpResult)958			excludePathRegexes = append(excludePathRegexes, regexpResult)959		} else {960			printError(err.Error())961		}962	}963964	go func() {965		err := fileWalker.Start()966		if err != nil {967			printError(err.Error())968		}969	}()970971	go func() {972		for _, f := range filePaths {973			fileInfo, err := os.Lstat(f)974			if err != nil {975				continue976			}977978			fileJob := newFileJob(f, f, fileInfo)979			if fileJob != nil {980				fileListQueue <- fileJob981			}982		}983984		for fi := range potentialFilesQueue {985			shouldExclude := false986			for _, re := range excludePathRegexes {987				if re.MatchString(fi.Location) {988					shouldExclude = true989					break990				}991			}992			if shouldExclude {993				continue994			}995996			fileInfo, err := os.Lstat(fi.Location)997			if err != nil {998				continue999			}10001001			if !fileInfo.IsDir() {1002				fileJob := newFileJob(fi.Location, fi.Filename, fileInfo)1003				if fileJob != nil {1004					fileListQueue <- fileJob1005				}1006			}1007		}1008		close(fileListQueue)1009	}()10101011	go ctx.fileProcessorWorker(fileListQueue, fileSummaryJobQueue)10121013	result := fileSummarize(fileSummaryJobQueue)1014	if FileOutput == "" {1015		fmt.Print(result)1016	} else {1017		_ = os.WriteFile(FileOutput, []byte(result), 0644)1018		fmt.Println("results written to " + FileOutput)1019	}1020}

Code quality findings 48

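Map variable flagged as declared without initialization; writing to a nil map causes a panic, so initialize with make() or a composite literal. Note: the declaration quoted below already uses an empty composite literal (`map[string][]string{}`), so this map is initialized and not nil.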

L352

warning correctness nil-map-write

var ExtensionToLanguage = map[string][]string{}

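Map variable flagged as declared without initialization; writing to a nil map causes a panic, so a map must be initialized (with make() or a composite literal) before it is written to. Note: the declaration quoted below already uses a composite literal, which yields a non-nil map, so this finding looks like a false positive.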

L355

warning correctness nil-map-write

var ShebangLookup = map[string][]string{}

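Map variable flagged as declared without initialization; writing to a nil map causes a panic, so a map must be initialized (with make() or a composite literal) before it is written to. Note: the declaration quoted below already uses a composite literal, which yields a non-nil map, so this finding looks like a false positive.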

L358

warning correctness nil-map-write

var FilenameToLanguage = map[string]string{}

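Map variable flagged as declared without initialization; writing to a nil map causes a panic, so a map must be initialized (with make() or a composite literal) before it is written to. Note: the declaration quoted below already uses a composite literal, which yields a non-nil map, so this finding looks like a false positive.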

L361

warning correctness nil-map-write

var LanguageFeatures = map[string]LanguageFeature{}

Ensure errors are handled or logged

L582

warning correctness unhandled-error

if err != nil {

Ensure errors are handled or logged

L602

warning correctness unhandled-error

if err != nil {

Ensure every Lock() call is paired with an Unlock(), preferably deferred, to prevent deadlocks

L717

warning correctness lock-without-unlock

LanguageFeaturesMutex.Lock()

Blank identifier discarding results; verify intentional ignoring of return values

L818

warning correctness blank-identifier-discard

_, _ = fmt.Fprintf(dst, "%s (%s)\n", name, strings.Join(append(languageDatabase[name].Extensions, languageDatabase[name].FileNames...), ","))

Goroutine without waitgroup or channel; risks resource leaks or race conditions

L964

warning correctness goroutine-without-sync

go func() {

Ensure errors are handled or logged

L997

warning correctness unhandled-error

if err != nil {

Blank identifier discarding results; verify intentional ignoring of return values

L1017

warning correctness blank-identifier-discard

_ = os.WriteFile(FileOutput, []byte(result), 0644)

Multiple appends without pre-allocation; use make() with capacity when size is known

L169

info performance append-without-prealloc

rules = append(rules, remapRule{

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L170

info correctness string-to-byte-in-loop

pattern: []byte(t[0]),

Adjusting garbage collection settings dynamically can be a sign of deeper problems in the codebase, suggesting a need for better coding practices

L373

info correctness gc-tuning

gcPercent = debug.SetGCPercent(gcPercent)

Range loop copies each element by value; for large structs, iterate by index or key, or store pointers, to avoid the copies

L385

info performance copy-large-struct

for name, value := range languageDatabase {

Multiple appends without pre-allocation; use make() with capacity when size is known

L387

info performance append-without-prealloc

ExtensionToLanguage[ext] = append(ExtensionToLanguage[ext], name)

Multiple appends without pre-allocation; use make() with capacity when size is known

L424

info performance append-without-prealloc

val = append(val, f)

Range loop copies each element by value; for large structs, iterate by index or key, or store pointers, to avoid the copies

L441

info performance copy-large-struct

for name, value := range languageDatabase {

Multiple appends without pre-allocation; use make() with capacity when size is known

L453

info performance append-without-prealloc

fixedPath = append(fixedPath, strings.TrimRight(path, "/"))

Multiple appends without pre-allocation; use make() with capacity when size is known

L586

info performance append-without-prealloc

CountRules = append(CountRules, rule)

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L672

info correctness string-to-byte-in-loop

complexityTrie.Insert(TComplexity, []byte(v))

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L674

info correctness string-to-byte-in-loop

tokenTrie.Insert(TComplexity, []byte(v))

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L683

info correctness string-to-byte-in-loop

tokenTrie.Insert(TComplexityPostfix, []byte(v))

Multiple appends without pre-allocation; use make() with capacity when size is known

L689

info performance append-without-prealloc

postfixExcludes = append(postfixExcludes, []byte(v))

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L689

info correctness string-to-byte-in-loop

postfixExcludes = append(postfixExcludes, []byte(v))

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L694

info correctness string-to-byte-in-loop

slCommentTrie.Insert(TSlcomment, []byte(v))

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L695

info correctness string-to-byte-in-loop

tokenTrie.Insert(TSlcomment, []byte(v))

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L701

info correctness string-to-byte-in-loop

mlCommentTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L702

info correctness string-to-byte-in-loop

tokenTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L708

info correctness string-to-byte-in-loop

stringTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L709

info correctness string-to-byte-in-loop

tokenTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop

L714

info correctness string-to-byte-in-loop

keywordBytes = append(keywordBytes, []byte(v))

Multiple appends without pre-allocation; use make() with capacity when size is known

L714

info performance append-without-prealloc

keywordBytes = append(keywordBytes, []byte(v))

Multiple appends without pre-allocation; use make() with capacity when size is known

L810

info performance append-without-prealloc

names = append(names, key)

Multiple appends without pre-allocation; use make() with capacity when size is known

L840

info performance append-without-prealloc

DirFilePaths = append(DirFilePaths, ".")

Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)

L855

info correctness fmt-println

fmt.Println(err)

Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)

L862

info correctness fmt-println

fmt.Println("--hotspots is mutually exclusive with --by-author / --timeline; pick one report")

Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)

L868

info correctness fmt-println

fmt.Println(err)

Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)

L875

info correctness fmt-println

fmt.Println(err)

Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)

L883

info correctness fmt-println

fmt.Println(err)

Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)

L891

info correctness fmt-println

fmt.Println(err)

Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)

L899

info correctness fmt-println

fmt.Println(err)

Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)

L914

info correctness fmt-println

fmt.Println("file or directory could not be read: " + fpath)

Multiple appends without pre-allocation; use make() with capacity when size is known

L919

info performance append-without-prealloc

dirPaths = append(dirPaths, fpath)

Multiple appends without pre-allocation; use make() with capacity when size is known

L956

info performance append-without-prealloc

fileWalker.ExcludeFilenameRegex = append(fileWalker.ExcludeFilenameRegex, regexpResult)

Multiple appends without pre-allocation; use make() with capacity when size is known

L957

info performance append-without-prealloc

fileWalker.ExcludeDirectoryRegex = append(fileWalker.ExcludeDirectoryRegex, regexpResult)

Multiple appends without pre-allocation; use make() with capacity when size is known

L958

info performance append-without-prealloc

excludePathRegexes = append(excludePathRegexes, regexpResult)

Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)

L1018

info correctness fmt-println

fmt.Println("results written to " + FileOutput)

Code quality findings 48

Get this view in your editor