-
Notifications
You must be signed in to change notification settings - Fork 483
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
24 changed files
with
4,516 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
module crawlergo | ||
|
||
go 1.12 | ||
|
||
replace git.apache.org/thrift.git => github.com/apache/thrift v0.13.0 | ||
|
||
require ( | ||
github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4 | ||
github.com/chromedp/chromedp v0.5.2 | ||
github.com/cpuguy83/go-md2man/v2 v2.0.0 // indirect | ||
github.com/deckarep/golang-set v1.7.1 | ||
github.com/gogf/gf v1.16.6 | ||
github.com/panjf2000/ants/v2 v2.2.2 | ||
github.com/pkg/errors v0.8.1 | ||
github.com/sirupsen/logrus v1.4.2 | ||
github.com/urfave/cli/v2 v2.0.0 | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
package config | ||
|
||
import "time" | ||
|
||
const ( | ||
DefaultUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36" | ||
MaxTabsCount = 10 | ||
TabRunTimeout = 20 * time.Second | ||
DefaultInputText = "Crawlergo" | ||
FormInputKeyword = "Crawlergo" | ||
SuspectURLRegex = `(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;|*()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')` | ||
URLRegex = `((https?|ftp|file):)?//[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]` | ||
AttrURLRegex = `` | ||
DomContentLoadedTimeout = 5 * time.Second | ||
EventTriggerInterval = 100 * time.Millisecond // 单位毫秒 | ||
BeforeExitDelay = 1 * time.Second | ||
DefaultEventTriggerMode = EventTriggerAsync | ||
MaxCrawlCount = 200 | ||
) | ||
|
||
// HTTP request methods.
const (
	GET     = "GET"
	POST    = "POST"
	PUT     = "PUT"
	DELETE  = "DELETE"
	HEAD    = "HEAD"
	OPTIONS = "OPTIONS"
)
|
||
// Filter modes.
const (
	SimpleFilterMode = "simple"
	SmartFilterMode  = "smart"
	StrictFilterMode = "strict"
)
|
||
// Event trigger modes.
const (
	EventTriggerAsync = "async"
	EventTriggerSync  = "sync"
)
|
||
// Origins of a request: labels recording where a request came from.
const (
	FromTarget      = "Target"     // the initially supplied target
	FromNavigation  = "Navigation" // page navigation request
	FromXHR         = "XHR"        // asynchronous ajax request
	FromDOM         = "DOM"        // request parsed out of the DOM
	FromJSFile      = "JavaScript" // parsed from a JS script
	FromFuzz        = "PathFuzz"   // initial path fuzz
	FromRobots      = "robots.txt" // robots.txt
	FromComment     = "Comment"    // comments in the page
	FromWebSocket   = "WebSocket"
	FromEventSource = "EventSource"
	FromFetch       = "Fetch"
	FromHistoryAPI  = "HistoryAPI"
	FromOpenWindow  = "OpenWindow"
	FromHashChange  = "HashChange"
	FromStaticRes   = "StaticResource"
	FromStaticRegex = "StaticRegex"
)
|
||
// Content-Type values for request bodies.
const (
	JSON       = "application/json"
	URLENCODED = "application/x-www-form-urlencoded"
	MULTIPART  = "multipart/form-data"
)
|
||
// StaticSuffix lists file extensions (without the leading dot) of static
// resources such as images, media, fonts, archives and office documents.
var StaticSuffix = []string{
	"png", "gif", "jpg", "mp4", "mp3", "mng", "pct", "bmp", "jpeg", "pst", "psp", "ttf",
	"tif", "tiff", "ai", "drw", "wma", "ogg", "wav", "ra", "aac", "mid", "au", "aiff",
	"dxf", "eps", "ps", "svg", "3gp", "asf", "asx", "avi", "mov", "mpg", "qt", "rm",
	"wmv", "m4a", "bin", "xls", "xlsx", "ppt", "pptx", "doc", "docx", "odt", "ods", "odg",
	"odp", "exe", "zip", "rar", "tar", "gz", "iso", "rss", "pdf", "txt", "dll", "ico",
	"gz2", "apk", "crt", "woff", "map", "woff2", "webp", "less", "dmg", "bz2", "otf", "swf",
	"flv", "mpeg", "dat", "xsl", "csv", "cab", "exif", "wps", "m4v", "rmvb",
}
|
||
// ScriptSuffix lists file extensions (without the leading dot) of
// server-side script pages.
var ScriptSuffix = []string{
	"php", "asp", "jsp", "asa",
}
|
||
// DefaultIgnoreKeywords is the default list of keywords to ignore; presumably
// matched against URLs so the crawler does not log itself out (verify against
// the caller).
var DefaultIgnoreKeywords = []string{"logout", "quit", "exit"}

// AllowedFormName lists the accepted form-field category names.
// NOTE(review): this list spells the ID-card category "id_card", while
// InputTextMap in this file keys it as "IDCard" — confirm which one the
// consumers expect.
var AllowedFormName = []string{"default", "mail", "code", "phone", "username", "password", "qq", "id_card", "url", "date", "number"}
|
||
// ContinueResourceList is a named list of strings. Its consumers are not
// visible in this file.
type ContinueResourceList []string
|
||
// InputTextMap maps a form-field category to two entries:
//
//	"keyword": a []string of substrings associated with the category
//	"value":   the string to use for fields of that category
//
// Presumably the keywords are matched against an input's name to choose the
// value to type in; the consumer is not visible in this file.
//
// NOTE(review): the ID-card category is keyed "IDCard" here, while
// AllowedFormName in this file spells it "id_card" — confirm which one the
// consumers expect before renaming either.
var InputTextMap = map[string]map[string]interface{}{
	"mail": {
		"keyword": []string{"mail"},
		"value":   "crawlergo@gmail.com",
	},
	"code": {
		"keyword": []string{"yanzhengma", "code", "ver", "captcha"},
		"value":   "123a",
	},
	"phone": {
		"keyword": []string{"phone", "number", "tel", "shouji"},
		"value":   "18812345678",
	},
	"username": {
		"keyword": []string{"name", "user", "id", "login", "account"},
		"value":   "crawlergo@gmail.com",
	},
	"password": {
		"keyword": []string{"pass", "pwd"},
		"value":   "Crawlergo6.",
	},
	"qq": {
		"keyword": []string{"qq", "wechat", "tencent", "weixin"},
		"value":   "123456789",
	},
	"IDCard": {
		"keyword": []string{"card", "shenfen"},
		"value":   "511702197409284963",
	},
	"url": {
		"keyword": []string{"url", "site", "web", "blog", "link"},
		"value":   "https://crawlergo.nice.cn/",
	},
	"date": {
		"keyword": []string{"date", "time", "year", "now"},
		"value":   "2018-01-01",
	},
	"number": {
		"keyword": []string{"day", "age", "num", "count"},
		"value":   "10",
	},
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package pkg | ||
|
||
import ( | ||
"crawlergo/pkg/model" | ||
mapset "github.com/deckarep/golang-set" | ||
"strings" | ||
) | ||
|
||
func SubDomainCollect(reqList []*model.Request, HostLimit string) []string { | ||
var subDomainList []string | ||
uniqueSet := mapset.NewSet() | ||
for _, req := range reqList { | ||
domain := req.URL.Hostname() | ||
if uniqueSet.Contains(domain) { | ||
continue | ||
} | ||
uniqueSet.Add(domain) | ||
if strings.HasSuffix(domain, "."+HostLimit) { | ||
subDomainList = append(subDomainList, domain) | ||
} | ||
} | ||
return subDomainList | ||
} | ||
|
||
func AllDomainCollect(reqList []*model.Request) []string { | ||
uniqueSet := mapset.NewSet() | ||
var allDomainList []string | ||
for _, req := range reqList { | ||
domain := req.URL.Hostname() | ||
if uniqueSet.Contains(domain) { | ||
continue | ||
} | ||
uniqueSet.Add(domain) | ||
allDomainList = append(allDomainList, req.URL.Hostname()) | ||
} | ||
return allDomainList | ||
} |
Oops, something went wrong.