home..
Web Crawler
David Alade /
webcrawler
python
The following is a Python web crawler that traverses a website called “Fakebook”. The program implements the HTTP protocol itself, over TLS sockets, to log in to the site and fetch pages from it. The goal of the crawler is to find the secret flags that are hidden in the HTML of pages throughout the site.
Code
#!/usr/bin/env python3
import argparse
import copy
import socket
import ssl
from html.parser import HTMLParser
from _socket import timeout
# Defaults for the -s / -p command-line options.
DEFAULT_SERVER = "proj5.3700.network"
DEFAULT_PORT = 443
# Shared crawl state. It lives at module level because both Crawler and MyHTMLParser read and update it.
# Links discovered by the HTML parser that are waiting to be requested.
frontier = []
# Paths that have already been requested.
visited = []
# Empty until the parser has seen the "/fakebook/" link; non-empty afterwards.
first_run = []
# Entries recorded by the HTML parser; the crawl stops once enough have been collected.
flags = []
# Value of the login form's csrfmiddlewaretoken field, captured by the HTML parser and sent in the login POST.
token = ''
# Set to True once the login POST has been sent.
logged_in = False
# Set to True at the end of login; switches response handling into crawl mode.
crawling = False
# Session id taken from the login response headers.
sessionid = None
# Full "Cookie: ..." header line sent with requests made after login.
cookie = None
# This class drives the whole crawl: it speaks just enough HTTP/1.1 over TLS to fetch pages, log in to Fakebook and
# walk the links the HTML parser discovers until enough flags have been found.
class Crawler:
    """Log in to Fakebook and crawl it for secret flags.

    Crawl state (frontier, visited, flags, token, cookie, sessionid and the logged_in / crawling switches) is kept
    in this module's globals because the HTML parser updates it while pages are being parsed.
    """

    # Seconds to wait for more response bytes before a read is abandoned.
    RECV_TIMEOUT = .3
    # Upper bound on the size of each individual read from the socket.
    RECV_CHUNK = 65536
    # Number of distinct flags that ends the crawl.
    FLAGS_WANTED = 5
    # A response containing this sequence is treated as "headers + HTML page"; any other response is treated as a
    # bare header block. NOTE(review): presumably this matches how the target server lays out its pages - confirm.
    HEAD_BODY_SEPARATOR = "\r\n\r\n\n\n\n"

    def __init__(self, args):
        """Store the connection settings and credentials parsed from the command line."""
        self.server = args.server
        self.port = args.port
        self.username = args.username
        self.password = args.password
        self.session_cookie = None
        self.all_flags = False

    def run(self):
        """Fetch the root page, log in to Fakebook and then crawl the site for flags."""
        self.send_get_message("/", "")
        home_page_dict = self.login()
        self.crawl(home_page_dict)

    def _send(self, payload):
        """Open a TLS connection to the server, send *payload* (bytes) and return the connected socket."""
        context = ssl.create_default_context()
        mysocket = context.wrap_socket(socket.socket(socket.AF_INET, socket.SOCK_STREAM),
                                       server_hostname=self.server)
        try:
            mysocket.connect((self.server, self.port))
            # sendall keeps writing until the whole request is out; send may stop early.
            mysocket.sendall(payload)
        except OSError:
            # Do not leak the descriptor when connecting or sending fails.
            mysocket.close()
            raise
        return mysocket

    def _read_all(self, mysocket):
        """Read from *mysocket* until it closes or goes quiet, close it, and return the decoded text."""
        mysocket.settimeout(self.RECV_TIMEOUT)
        chunks = []
        try:
            # One recv is not guaranteed to deliver a whole response, so keep reading until end of stream.
            while True:
                chunk = mysocket.recv(self.RECV_CHUNK)
                if not chunk:
                    break
                chunks.append(chunk)
        except (timeout, ConnectionError):
            # Best effort: work with whatever arrived before the connection stalled or dropped.
            pass
        finally:
            mysocket.close()
        # UTF-8 is a superset of ASCII; undecodable bytes are replaced instead of aborting the crawl.
        return b"".join(chunks).decode('utf-8', errors='replace')

    @staticmethod
    def _status_code(head):
        """Return the status code from the first line of a response, or '' when there is none."""
        pieces = head.split(None, 2)
        return pieces[1] if len(pieces) > 1 else ""

    @staticmethod
    def _flags_found():
        """Count the distinct flag texts collected so far; anything in `flags` that is not text is ignored."""
        return len({entry for entry in flags if isinstance(entry, str)})

    def send_get_message(self, path, cookie):
        """Send a GET request for *path* and return whatever recv_data makes of the response.

        *cookie* is either an empty string or a complete "Cookie: ..." header line. The path is recorded in the
        list of visited links.
        """
        header_lines = ["GET " + path + " HTTP/1.1", "Host: " + self.server, "Connection: close"]
        if cookie:
            header_lines.append(cookie)
        # HTTP header lines end in CRLF and the header block is closed by one empty line.
        request = "\r\n".join(header_lines) + "\r\n\r\n"
        mysocket = self._send(request.encode('ascii'))
        if path not in visited:
            visited.append(path)
        return self.recv_data(mysocket, path)

    def recv_data(self, mysocket, path=None):
        """Read the response on *mysocket* and act on it according to the current crawl state.

        *path* is the path that was requested with GET (None for the login POST); it lets a 503 response be
        retried against the same path.

        Returns the page HTML while crawling, the parsed header dict before crawling starts, and None when
        nothing usable was received or the page is not available.
        """
        global sessionid
        global cookie
        data = self._read_all(mysocket)
        if not data:
            return None
        data_list = data.split(self.HEAD_BODY_SEPARATOR, 1)
        has_page = len(data_list) == 2
        # Decide on the status line only; looking for "503" anywhere in the response can match unrelated text.
        status = self._status_code(data_list[0])
        if status == "503" and path is not None:
            return self.send_get_message(path, cookie or "")
        # Handles how we receive data when we are crawling through the website
        if crawling:
            if not has_page:
                # 403 / 404 and other page-less responses: abandon this link, the crawl loop moves on.
                return None
            self.handle_html(data_list[1])
            return data_list[1]
        if has_page:
            http_dict = self.handle_http(data_list[0])
            self.handle_html(data_list[1])
            return http_dict
        # Header-only response outside crawl mode: treated as a redirect to follow.
        data_dict = self.handle_http(data_list[0])
        if not logged_in:
            return self.send_get_message(data_dict['Location'], "")
        sessionid = data_dict['sessionid']
        csrf_value = data_dict['Set-Cookie'].split(";")[0].replace("csrftoken=", "")
        cookie = "Cookie: csrftoken=" + csrf_value + "; sessionid=" + sessionid
        return self.send_get_message(data_dict['Location'], cookie)

    @staticmethod
    def handle_html(data):
        """Feed the HTML part of a response to the parser, which updates the shared crawl state."""
        html_parser = MyHTMLParser()
        html_parser.feed(data)

    @staticmethod
    def handle_http(data):
        """Turn the header part of a response into a dict of header name -> value.

        The first occurrence of a header name is stored under that name. A repeated header name is stored under
        'sessionid', reduced to the text before the first ';' with any "sessionid=" prefix removed. Before login,
        'Set-Cookie' is reduced the same way to the bare csrftoken value.
        """
        response_dict = {}
        for line in data.splitlines():
            # Split on the first colon only, so values that contain colons (dates, URLs) stay intact.
            name, colon, value = line.partition(":")
            if not colon:
                continue
            value = value.strip()
            if name not in response_dict:
                response_dict[name] = value
            else:
                response_dict['sessionid'] = value.split(";")[0].replace("sessionid=", "")
        # Not every response sets a cookie, so only rewrite the entry when it exists.
        if not logged_in and 'Set-Cookie' in response_dict:
            response_dict["Set-Cookie"] = response_dict['Set-Cookie'].split(";")[0].replace("csrftoken=", "")
        return response_dict

    def login(self):
        """Load the login form, POST the credentials and return the result of following the login response.

        Raises RuntimeError when the login page could not be loaded.
        """
        global logged_in
        global crawling
        from urllib.parse import urlencode

        entry_path = frontier[0] if frontier else "/fakebook/"
        http_dict = self.send_get_message(entry_path, '')
        if not http_dict:
            raise RuntimeError("could not load the login page from " + self.server)
        self.session_cookie = http_dict.get('Set-Cookie')
        # Form fields must be URL-encoded, otherwise a password containing '&', '=', '+' or '%' corrupts the body.
        login_info = urlencode({
            "username": self.username,
            "password": self.password,
            "csrfmiddlewaretoken": token,
        })
        body = login_info.encode('ascii')
        cookie_parts = []
        if 'sessionid' in http_dict:
            cookie_parts.append("sessionid=" + http_dict['sessionid'])
        if 'Set-Cookie' in http_dict:
            cookie_parts.append("csrftoken=" + http_dict['Set-Cookie'])
        # Creates the format of the post message to send to the server in order to login
        header_lines = [
            "POST /accounts/login/?next=/fakebook/ HTTP/1.1",
            "Host: " + self.server,
            "Connection: close",
            "Content-Length: " + str(len(body)),
            "Content-Type: application/x-www-form-urlencoded",
        ]
        if cookie_parts:
            header_lines.append("Cookie: " + "; ".join(cookie_parts))
        # The body is exactly Content-Length bytes long; nothing is appended after it.
        head = "\r\n".join(header_lines) + "\r\n\r\n"
        mysocket = self._send(head.encode('ascii') + body)
        logged_in = True
        post_login = self.recv_data(mysocket)
        if "/fakebook/" not in visited:
            visited.append("/fakebook/")
        crawling = True
        return post_login

    def crawl(self, home_page):
        """Request queued links until FLAGS_WANTED distinct flags are found or no unvisited link is left.

        *home_page* is the header dict returned by login(); when it carries a Set-Cookie value, the cookie header
        for the crawl is rebuilt from it. Raises RuntimeError when no session cookie is available at all.
        """
        global cookie
        csrf_cookie = home_page.get('Set-Cookie') if isinstance(home_page, dict) else None
        if csrf_cookie is not None and sessionid is not None:
            # Only the leading name=value pair belongs in a request; the rest are Set-Cookie attributes.
            cookie = "Cookie: " + csrf_cookie.split(";")[0] + "; sessionid=" + sessionid
        if cookie is None:
            raise RuntimeError("login failed: the server did not hand out a session cookie")
        # Continues to loop through the links until we have found all the flags needed
        while self._flags_found() < self.FLAGS_WANTED:
            already_seen = set(visited)
            pending = [link for link in dict.fromkeys(frontier) if link not in already_seen]
            # Empty the queue in place (the parser appends to this same list); links discovered during this pass
            # are picked up by the next one.
            frontier.clear()
            if not pending:
                # Nothing left to visit: stop instead of spinning forever.
                return
            for link in pending:
                if self._flags_found() >= self.FLAGS_WANTED:
                    return
                self.send_get_message(link, cookie)
# Implements the HTML parser class, so we can parse HTML responses from fakebook
class MyHTMLParser(HTMLParser):
    """Parse a Fakebook page and update the shared crawl state.

    Links are queued on `frontier`, the login form's CSRF token is stored in `token`, and flag texts are
    recorded in `flags` and printed.
    """

    def handle_starttag(self, tag, attrs):
        """Queue crawlable links from <a> tags and capture the CSRF token from <input> tags."""
        global token
        if tag == 'a':
            # Look the link up by attribute name; it is not necessarily the first attribute and may be absent.
            href = dict(attrs).get('href')
            if href is None:
                return
            if not first_run:
                # Until the "/fakebook/" entry link has been seen, that is the only link of interest.
                if href == "/fakebook/":
                    frontier.append(href)
                    first_run.append('LOL')
                return
            segments = href.split('/')
            if len(segments) > 1 and segments[1] == 'fakebook':
                # Skip links that were already requested or are already waiting in the queue.
                if href not in visited and href not in frontier:
                    frontier.append(href)
        elif tag == "input":
            fields = dict(attrs)
            # Attribute order does not matter: find the csrfmiddlewaretoken field and take its value.
            if fields.get('name') == 'csrfmiddlewaretoken' and fields.get('value') is not None:
                token = fields['value']

    def handle_endtag(self, tag):
        """End tags carry no information this crawler needs."""
        pass

    def handle_data(self, data):
        """Record and print a flag when *data* contains 'FLAG' followed by a colon-separated value."""
        if 'FLAG' not in data or ':' not in data:
            return
        if data in flags:
            # Report each flag only once, so len(flags) reflects the number of distinct flags.
            return
        flags.append(data)
        print(data.split(":", 1)[1].strip())
if __name__ == "__main__":
    # Command-line entry point: two optional switches (server, port) followed by the two required credentials.
    cli = argparse.ArgumentParser(description='crawl Fakebook')
    argument_specs = (
        (('-s',), {'dest': "server", 'type': str, 'default': DEFAULT_SERVER, 'help': "The server to crawl"}),
        (('-p',), {'dest': "port", 'type': int, 'default': DEFAULT_PORT, 'help': "The port to use"}),
        (('username',), {'type': str, 'help': "The username to use"}),
        (('password',), {'type': str, 'help': "The password to use"}),
    )
    for names, settings in argument_specs:
        cli.add_argument(*names, **settings)
    # Build the crawler from the parsed arguments and start it.
    Crawler(cli.parse_args()).run()
Theme Moonwalk