processor/detector.go GO 232 lines View on github.com → Search inside
1// SPDX-License-Identifier: MIT23package processor45import (6	"bytes"7	"cmp"8	"errors"9	"slices"10	"strings"11)1213var (14	errMissingShebang              = errors.New("missing shebang")15	errUnknownShebang              = errors.New("unknown shebang")16	errUnableToDetermineShebangCmd = errors.New("unable to determine shebang command")17)1819// DetectLanguage detects a language based on the filename returns the language extension and error20func DetectLanguage(name string) ([]string, string) {21	extension := ""2223	if len(AllowListExtensions) == 0 {24		// Check the full name for special languages such as xmake.lua, meson.build, ...25		lang, ok := FilenameToLanguage[strings.ToLower(name)]26		if ok {27			return []string{lang}, name28		}2930		t := strings.Count(name, ".")31		// If there is no . in the filename or it starts with one then check if #!32		if t == 0 || (name[0] == '.' && t == 1) {33			printWarnF("possible #! file: %s", name)3435			// No extension indicates possible #! so mark as such for processing36			return []string{SheBang}, name37		}38	}3940	// Lookup in case the full name matches41	language, ok := ExtensionToLanguage[strings.ToLower(name)]4243	// If no match check if we have a matching extension44	if !ok {45		extension = getExtension(name)46		language, ok = ExtensionToLanguage[extension]47	}4849	// Convert from d.ts to ts and check that in case of multiple extensions50	if !ok {51		extension = getExtension(extension)52		language = ExtensionToLanguage[extension]53	}5455	return language, extension56}5758// DetectSheBang given some content attempt to determine if it has a #! that maps to a known language and return the language59func DetectSheBang(content string) (string, error) {60	if !strings.HasPrefix(content, "#!") {61		return "", errMissingShebang62	}6364	index := strings.Index(content, "\n")6566	if index != -1 {67		content = content[:index]68	}6970	cmd, err := scanForSheBang([]byte(content))7172	if err != nil {73		return "", err74	}7576	for k, v := range ShebangLookup {77		if slices.Contains(v, cmd) {78			// detects both full path and env usage79			return k, nil80		}81	}8283	return "", errUnknownShebang84}8586func scanForSheBang(content []byte) (string, error) {87	state := 088	lastSlash := 08990	candidate1 := ""91	candidate2 := ""9293loop:94	for i := range content {95		switch state {96		case 0: // Deals with whitespace after #! and before first /97			if content[i] == '/' {98				lastSlash = i99				state = 1100			}101		case 1: // Once we found the first / keep going till we hit whitespace102			if content[i] == '/' {103				lastSlash = i104			}105106			// when at the end pull out the candidate107			if i == len(content)-1 {108				candidate1 = string(content[lastSlash+1 : i+1])109			}110111			// between last slash and here is the first candidate which is either env or Perl/PHP/Python etc..112			if isWhitespace(content[i]) {113				// mark from lastSlash to here as first argument114				candidate1 = string(content[lastSlash+1 : i])115				state = 2116			}117		case 2: // We have the first candidate, see if there is another118			// go till end of whitespace, mark that spot as new start119			if !isWhitespace(content[i]) {120				lastSlash = i121				state = 3122			}123		case 3:124			if i == len(content)-1 {125				candidate2 = string(content[lastSlash : i+1])126			}127128			if isWhitespace(content[i]) {129				candidate2 = string(content[lastSlash:i])130				state = 4131			}132		case 4:133			break loop134		}135	}136137	switch {138	case candidate1 == "env":139		return candidate2, nil140	case candidate1 != "":141		return candidate1, nil142	}143144	return "", errUnableToDetermineShebangCmd145}146147type languageGuess struct {148	Name  string149	Count int150}151152// DetermineLanguage given a filename, fallback language, possible languages and content make a guess to the type.153// If multiple possible it will guess based on keywords similar to how https://github.com/vmchale/polyglot does154func DetermineLanguage(filename string, fallbackLanguage string, possibleLanguages []string, content []byte) string {155	// If being called through an API it's possible nothing is set here and as156	// such should just return as the Language value should have already been set157	if len(possibleLanguages) == 0 {158		return fallbackLanguage159	}160161	// There should only be two possibilities now, either we have a single fallbackLanguage162	// in which case we set it and return163	// or we have multiple in which case we try to determine it heuristically164	if len(possibleLanguages) == 1 {165		return possibleLanguages[0]166	}167168	startTime := makeTimestampNano()169170	toCheck := content171	if len(content) > 20_000 {172		toCheck = content[:20_000]173	}174175	primary := ""176177	toSort := make([]languageGuess, 0, len(possibleLanguages))178	for _, lan := range possibleLanguages {179		LanguageFeaturesMutex.Lock()180		langFeatures := LanguageFeatures[lan]181		LanguageFeaturesMutex.Unlock()182183		count := 0184		for _, key := range langFeatures.KeywordBytes {185			if bytes.Contains(toCheck, key) {186				count++187			}188		}189190		// if no features are found that means that this one is considered the primary191		// and as such the default fallback if we don't find a suitable number of matching192		// keywords193		// consider YAML files for example, where cloudformation files can also be YAML194		// YAML can have any form so it's not possible to say "this is a yaml file"195		// so we can only say "this is likely to be a cloudformation file", and as such196		// we need to handle a fallback case, which in this case is nothing197		if len(langFeatures.Keywords) == 0 {198			primary = lan199		}200201		toSort = append(toSort, languageGuess{Name: lan, Count: count})202	}203204	slices.SortFunc(toSort, func(a, b languageGuess) int {205		if order := cmp.Compare(b.Count, a.Count); order != 0 {206			return order207		}208		return strings.Compare(a.Name, b.Name)209	})210211	// fmt.Println(toSort)212	// fmt.Println(possibleLanguages)213	// fmt.Println(primary, toSort[0].Name, toSort[0].Count)214215	if primary != "" && len(toSort) != 0 {216		// OK at this point we have a primary, which means we want 3 or more matches to count as something else217		if toSort[0].Count < 3 {218			// we didn't find enough results, so lets return the primary in this case219			return primary220		}221	}222223	printWarnF("guessing language %s for file %s", toSort[0].Name, filename)224	printTraceF("nanoseconds to guess language: %s: %d", filename, makeTimestampNano()-startTime)225226	if len(toSort) != 0 {227		return toSort[0].Name228	}229230	return fallbackLanguage231}

Code quality findings 6

String to byte slice conversion inside loop allocates a new slice each iteration; convert once before the loop
info correctness string-to-byte-in-loop
cmd, err := scanForSheBang([]byte(content))
Range over slice copies each element by value; use index or pointer receiver for large structs to avoid copies
info performance copy-large-struct
for k, v := range ShebangLookup {
Multiple appends without pre-allocation; use make() with capacity when size is known
info performance append-without-prealloc
toSort = append(toSort, languageGuess{Name: lan, Count: count})
Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)
info correctness fmt-println
// fmt.Println(toSort)
Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)
info correctness fmt-println
// fmt.Println(possibleLanguages)
Unstructured output; use a structured logging library (e.g., slog, zap, zerolog, logrus)
info correctness fmt-println
// fmt.Println(primary, toSort[0].Name, toSort[0].Count)

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.