processor/detector.go GO 223 lines View on github.com → Search inside
1// SPDX-License-Identifier: MIT23package processor45import (6	"bytes"7	"cmp"8	"errors"9	"slices"10	"strings"11)1213var (14	errMissingShebang              = errors.New("missing shebang")15	errUnknownShebang              = errors.New("unknown shebang")16	errUnableToDetermineShebangCmd = errors.New("unable to determine shebang command")17)1819// DetectLanguage detects a language based on the filename returns the language extension and error20func DetectLanguage(name string) ([]string, string) {21	extension := ""2223	if len(AllowListExtensions) == 0 {24		// Check the full name for special languages such as xmake.lua, meson.build, ...25		lang, ok := FilenameToLanguage[strings.ToLower(name)]26		if ok {27			return []string{lang}, name28		}2930		t := strings.Count(name, ".")31		// If there is no . in the filename or it starts with one then check if #!32		if t == 0 || (name[0] == '.' && t == 1) {33			printWarnF("possible #! file: %s", name)3435			// No extension indicates possible #! so mark as such for processing36			return []string{SheBang}, name37		}38	}3940	// Lookup in case the full name matches41	language, ok := ExtensionToLanguage[strings.ToLower(name)]4243	// If no match check if we have a matching extension44	if !ok {45		extension = getExtension(name)46		language, ok = ExtensionToLanguage[extension]47	}4849	// Convert from d.ts to ts and check that in case of multiple extensions50	if !ok {51		extension = getExtension(extension)52		language = ExtensionToLanguage[extension]53	}5455	return language, extension56}5758// DetectSheBang given some content attempt to determine if it has a #! that maps to a known language and return the language59func DetectSheBang(content []byte) (string, error) {60	if !bytes.HasPrefix(content, []byte("#!")) {61		return "", errMissingShebang62	}6364	content, _, _ = bytes.Cut(content, []byte{'\n'})6566	cmd, err := scanForSheBang(content)67	if err != nil {68		return "", err69	}7071	for k, v := range ShebangLookup {72		if slices.Contains(v, cmd) {73			// detects both full path and env usage74			return k, nil75		}76	}7778	return "", errUnknownShebang79}8081func scanForSheBang(content []byte) (string, error) {82	state := 083	lastSlash := 08485	candidate1 := ""86	candidate2 := ""8788loop:89	for i := range content {90		switch state {91		case 0: // Deals with whitespace after #! and before first /92			if content[i] == '/' {93				lastSlash = i94				state = 195			}96		case 1: // Once we found the first / keep going till we hit whitespace97			if content[i] == '/' {98				lastSlash = i99			}100101			// when at the end pull out the candidate102			if i == len(content)-1 {103				candidate1 = string(content[lastSlash+1 : i+1])104			}105106			// between last slash and here is the first candidate which is either env or Perl/PHP/Python etc..107			if isWhitespace(content[i]) {108				// mark from lastSlash to here as first argument109				candidate1 = string(content[lastSlash+1 : i])110				state = 2111			}112		case 2: // We have the first candidate, see if there is another113			// go till end of whitespace, mark that spot as new start114			if !isWhitespace(content[i]) {115				lastSlash = i116				state = 3117			}118		case 3:119			if i == len(content)-1 {120				candidate2 = string(content[lastSlash : i+1])121			}122123			if isWhitespace(content[i]) {124				candidate2 = string(content[lastSlash:i])125				state = 4126			}127		case 4:128			break loop129		}130	}131132	switch {133	case candidate1 == "env":134		return candidate2, nil135	case candidate1 != "":136		return candidate1, nil137	}138139	return "", errUnableToDetermineShebangCmd140}141142type languageGuess struct {143	Name  string144	Count int145}146147// DetermineLanguage given a filename, fallback language, possible languages and content make a guess to the type.148// If multiple possible it will guess based on keywords similar to how https://github.com/vmchale/polyglot does149func DetermineLanguage(filename string, fallbackLanguage string, possibleLanguages []string, content []byte) string {150	// If being called through an API it's possible nothing is set here and as151	// such should just return as the Language value should have already been set152	if len(possibleLanguages) == 0 {153		return fallbackLanguage154	}155156	// There should only be two possibilities now, either we have a single fallbackLanguage157	// in which case we set it and return158	// or we have multiple in which case we try to determine it heuristically159	if len(possibleLanguages) == 1 {160		return possibleLanguages[0]161	}162163	startTime := makeTimestampNano()164165	toCheck := content166	if len(content) > 20_000 {167		toCheck = content[:20_000]168	}169170	primary := ""171172	toSort := make([]languageGuess, 0, len(possibleLanguages))173	for _, lan := range possibleLanguages {174		LanguageFeaturesMutex.Lock()175		langFeatures := LanguageFeatures[lan]176		LanguageFeaturesMutex.Unlock()177178		count := 0179		for _, key := range langFeatures.KeywordBytes {180			if bytes.Contains(toCheck, key) {181				count++182			}183		}184185		// if no features are found that means that this one is considered the primary186		// and as such the default fallback if we don't find a suitable number of matching187		// keywords188		// consider YAML files for example, where cloudformation files can also be YAML189		// YAML can have any form so it's not possible to say "this is a yaml file"190		// so we can only say "this is likely to be a cloudformation file", and as such191		// we need to handle a fallback case, which in this case is nothing192		if len(langFeatures.Keywords) == 0 {193			primary = lan194		}195196		toSort = append(toSort, languageGuess{Name: lan, Count: count})197	}198199	slices.SortFunc(toSort, func(a, b languageGuess) int {200		if order := cmp.Compare(b.Count, a.Count); order != 0 {201			return order202		}203		return strings.Compare(a.Name, b.Name)204	})205206	if primary != "" && len(toSort) != 0 {207		// OK at this point we have a primary, which means we want 3 or more matches to count as something else208		if toSort[0].Count < 3 {209			// we didn't find enough results, so lets return the primary in this case210			return primary211		}212	}213214	printWarnF("guessing language %s for file %s", toSort[0].Name, filename)215	printTraceF("nanoseconds to guess language: %s: %d", filename, makeTimestampNano()-startTime)216217	if len(toSort) != 0 {218		return toSort[0].Name219	}220221	return fallbackLanguage222}

Code quality findings 3

Blank identifier discarding results; verify intentional ignoring of return values
warning correctness blank-identifier-discard
content, _, _ = bytes.Cut(content, []byte{'\n'})
Range over slice copies each element by value; use index or pointer receiver for large structs to avoid copies
info performance copy-large-struct
for k, v := range ShebangLookup {
Multiple appends without pre-allocation; use make() with capacity when size is known
info performance append-without-prealloc
toSort = append(toSort, languageGuess{Name: lan, Count: count})

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.