2017-03-29 22:18:01 +01:00
|
|
|
import shutil
|
|
|
|
import os
|
2017-04-01 16:12:03 +01:00
|
|
|
import logging
|
2017-05-25 20:34:41 +01:00
|
|
|
from bs4 import BeautifulSoup
|
2017-06-09 23:07:30 +01:00
|
|
|
from typing import List
|
|
|
|
|
2017-04-01 16:12:03 +01:00
|
|
|
|
|
|
|
# Module-level logger shared by the helpers in this file; it is named after
# this file's path (``__file__``).
logger = logging.getLogger(__file__)
|
2017-03-29 22:18:01 +01:00
|
|
|
|
|
|
|
|
2017-06-09 23:07:30 +01:00
|
|
|
def remove_dir(dir: str):
    """Recursively delete the directory ``dir`` if it exists.

    A missing directory is not treated as an error; the call is then a no-op.
    Any other failure raised by ``shutil.rmtree`` propagates to the caller.

    :param dir: path of the directory tree to remove.
    """
    logger.debug("Removing directory {}.".format(dir))
    try:
        # rmtree removes the directory itself together with its contents, so
        # no follow-up os.rmdir is required (it could only ever fail with
        # FileNotFoundError once rmtree has succeeded).
        shutil.rmtree(dir)
    except FileNotFoundError:
        # Nothing to delete: the directory is already absent.
        pass
|
2017-04-04 21:44:17 +01:00
|
|
|
|
|
|
|
|
2017-06-09 23:07:30 +01:00
|
|
|
def safe_list_get(l: List, idx: int, default):
    """Look up ``l[idx]``, falling back to ``default`` on ``IndexError``.

    :param l: the sequence to index into.
    :param idx: the position to read.
    :param default: value handed back when indexing raises ``IndexError``.
    :return: the element at ``idx``, or ``default`` if it is out of range.
    """
    try:
        item = l[idx]
    except IndexError:
        # Out-of-range position: use the caller-supplied fallback.
        item = default
    return item
|
2017-05-25 20:34:41 +01:00
|
|
|
|
|
|
|
|
2017-06-09 23:07:30 +01:00
|
|
|
def get_plain_text(content: str) -> str:
    """Return the text of the HTML ``<body>`` with the references removed.

    ``content`` is parsed with BeautifulSoup's ``html.parser``. If the
    document has no ``<body>`` element, ``content`` is returned unchanged.
    Otherwise the ``h1.references-title`` heading and the ``div.references``
    block are dropped from the body (each only if present) before its text
    is extracted.

    :param content: HTML markup (or plain text without a ``<body>``).
    :return: the body's text content, or ``content`` itself when no
        ``<body>`` element is found.
    """
    soup = BeautifulSoup(content, 'html.parser')
    body = soup.find('body')
    if body is None:
        return content

    # Remove each references element independently: a missing heading must
    # not prevent the references block from being stripped (and vice versa).
    for tag_name, css_class in (('h1', 'references-title'),
                                ('div', 'references')):
        node = body.find(tag_name, class_=css_class)
        if node is not None:
            node.extract()

    return body.text
|