-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathai.py
61 lines (50 loc) · 2.22 KB
/
ai.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Reads ./data/data.csv and uses logistic regression to classify malicious URLs
Original Code: http://fsecurify.com/using-machine-learning-detect-malicious-urls/
It has been slightly modified
"""
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
def getTokens(input):
    """Tokenize a URL by splitting on '/', then '-', then '.'.

    Used as the custom ``tokenizer`` for TfidfVectorizer in TL().
    Returns a de-duplicated list of tokens, with 'com' removed because it
    appears in almost every URL and carries no discriminative signal.
    """
    # BUG FIX: the original did str(input.encode('utf-8')), which on
    # Python 3 yields the bytes repr "b'...'" and pollutes tokens with
    # b'-prefix artifacts. Tokenize the string directly instead.
    tokensBySlash = str(input).split('/')
    allTokens = []
    for segment in tokensBySlash:
        tokensByDash = segment.split('-')  # split each path segment on dashes
        tokensByDot = []
        for piece in tokensByDash:
            # further split each dash-separated piece on dots
            tokensByDot = tokensByDot + piece.split('.')
        allTokens = allTokens + tokensByDash + tokensByDot
    allTokens = list(set(allTokens))  # remove redundant tokens
    if 'com' in allTokens:
        # drop 'com': too frequent to be a useful feature
        allTokens.remove('com')
    return allTokens
def TL():
    """Train a logistic-regression classifier on labelled URLs.

    Reads ./data/data.csv (expected columns: url, label), shuffles the rows,
    vectorizes the URLs with a TF-IDF model using the custom getTokens
    tokenizer, fits a LogisticRegression on an 80/20 train/test split, and
    prints the held-out accuracy.

    Returns:
        tuple: (fitted TfidfVectorizer, fitted LogisticRegression)
    """
    allurls = './data/data.csv'  # path to our all-urls file
    # FIX: error_bad_lines was removed in pandas 2.0 — on_bad_lines='skip'
    # is the replacement; also pass sep explicitly as a keyword.
    allurlsdata = pd.read_csv(allurls, sep=',', on_bad_lines='skip')
    # read_csv already returns a DataFrame; convert straight to an array
    allurlsdata = np.array(allurlsdata)
    random.shuffle(allurlsdata)  # shuffle rows in place
    y = [d[1] for d in allurlsdata]  # all labels (good/bad)
    # all urls corresponding to a label
    corpus = [d[0] for d in allurlsdata]
    # get a vector for each url, using our customized tokenizer
    vectorizer = TfidfVectorizer(tokenizer=getTokens)
    X = vectorizer.fit_transform(corpus)  # the feature matrix
    # split into training and testing set, 80/20 ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    lgs = LogisticRegression()  # using logistic regression
    lgs.fit(X_train, y_train)
    # print the held-out score (~98% on the original dataset)
    print(lgs.score(X_test, y_test))
    return vectorizer, lgs