PaperScraper/GetPapers.py at master · sgrieve/PaperScraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import requests
import slate
import os
import sys
import string


def LoadToken(FileName):
    '''
    Read the crossref click through api token from a file.

    Only the first line of FileName is read; it is returned with leading and
    trailing whitespace removed.
    '''
    with open(FileName, 'r') as token_file:
        first_line = token_file.readline()

    return first_line.strip()


def doi2url(doi):
    '''
    Build the dx.doi.org resolver url for a doi.

    The doi is inserted as is; nothing checks that it is a valid doi.
    '''
    template = 'http://dx.doi.org/{0}'
    return template.format(doi)


def doiFormat(doi):
    '''
    Return the doi with every slash swapped for an underscore.
    '''
    pieces = doi.split('/')
    return '_'.join(pieces)


def doiUnformat(doi):
    '''
    Return the doi with every underscore swapped for a slash.

    Every underscore is replaced, so an underscore that was already part of
    the doi before formatting also becomes a slash.
    '''
    pieces = doi.split('_')
    return '/'.join(pieces)


def DownloadPaper(doi, Token, Outpath):
    '''
    Download the full text of a paper and save it as a pdf.

    The doi is resolved with a crossref metadata request, the 'item' link of
    that response is followed using the supplied click through token, and the
    returned content is written to Outpath + formatted doi + '.pdf'.

    doi:     the doi of the paper, e.g. '10.1002/esp.3884'.
    Token:   value sent in the CR-Clickthrough-Client-Token header.
    Outpath: prefix for the output filename. It is concatenated as is, so a
             directory must end with a path separator.

    Raises requests.exceptions.HTTPError if either request returns an error
    status, and KeyError if the doi response carries no 'item' link. Nothing
    is written to disk in either case.

    Only tested on ESPL at present. So gets pdf. Will need to modify code for
    xml journals.
    '''

    url = doi2url(doi)

    # http header for crossref metadata request
    header = {'Accept': 'application/vnd.crossref.unixsd+xml'}

    # get the fulltext link, failing loudly if the doi could not be resolved
    r = requests.get(url, headers=header)
    r.raise_for_status()

    try:
        paperURL = r.links['item']['url']
    except KeyError:
        raise KeyError('No full text link found for doi {0}'.format(doi))

    # Add user's token to the new http header and request the paper
    header = {'CR-Clickthrough-Client-Token': Token}
    fulltext = requests.get(paperURL, headers=header)

    # Without this check an error page (e.g. a rejected token) would be saved
    # to disk as if it were the pdf.
    fulltext.raise_for_status()

    # Build output path + filename, changing / to _ in the doi
    Output = '{0}{1}.pdf'.format(Outpath, doiFormat(doi))

    # write the paper to the supplied path
    with open(Output, 'wb') as f:
        f.write(fulltext.content)


def StripText(text):
    '''
    Remove all punctuation and digits from raw text aside from periods, which
    are needed to identify sentence breaks.

    Only the ASCII characters in string.punctuation and string.digits are
    removed; every other character is kept unchanged and in order.

    The filtering is done per character rather than with string.maketrans and
    a two argument translate, because those only exist for Python 2 byte
    strings and fail on Python 3 str and Python 2 unicode input.
    '''

    Removed = frozenset(string.punctuation.replace('.', '') + string.digits)
    return ''.join(ch for ch in text if ch not in Removed)


def ExtractText(doi, pdfPath, txtPath):
    '''
    Extract raw text from downloaded pdf and save as a txt file.

    doi:     the doi of the paper; slashes are converted to underscores to
             build both filenames.
    pdfPath: prefix of the pdf to read, i.e. pdfPath + formatted doi + '.pdf'.
    txtPath: prefix of the text file to write, i.e. txtPath + formatted doi +
             '.txt'.

    Both prefixes are concatenated as is, so a directory must end with a path
    separator. Each page is passed through StripText and the pages are
    written one after another with nothing added between them.
    '''

    doi = doiFormat(doi)

    # A pdf is binary data, so it must be opened in binary mode; text mode
    # can corrupt it on Windows and fails to decode it on Python 3.
    with open('{0}{1}.pdf'.format(pdfPath, doi), 'rb') as f:
        paper = slate.PDF(f)

    with open('{0}{1}.txt'.format(txtPath, doi), 'w') as w:
        for page in paper:
            w.write(StripText(page))

def _PrepareDir(path):
    '''
    Create path, including any missing parent directories, and return it with
    a trailing path separator. An empty path is returned unchanged.
    '''
    if path and not os.path.isdir(path):
        os.makedirs(path)

    # Joining with '' appends the separator only when it is missing. The
    # download and extract steps build filenames by plain concatenation, so
    # without it 'out' + 'x.pdf' would become 'outx.pdf'.
    return os.path.join(path, '')


def Core(dois, pdfPath, txtPath):
    '''
    Download each paper in a list of dois and extract its text.

    dois:    iterable of doi strings.
    pdfPath: directory to write pdfs to.
    txtPath: directory to write text files to.

    Both directories are created, along with any missing parents, if they are
    not found, and may be given with or without a trailing path separator.

    The click through token is read from a file called Token.token in the
    current working directory; the program exits with a message if that file
    does not exist. Errors raised while downloading or extracting a paper are
    not caught here.
    '''

    pdfPath = _PrepareDir(pdfPath)
    txtPath = _PrepareDir(txtPath)

    if not os.path.isfile('Token.token'):
        sys.exit('\nToken file cannot be found.\n')

    Token = LoadToken('Token.token')

    for doi in dois:

        DownloadPaper(doi, Token, pdfPath)
        ExtractText(doi, pdfPath, txtPath)

if __name__ == '__main__':
    # Example run for a single ESPL paper. Guarded so that importing this
    # module to reuse its functions does not start a download.
    Core(['10.1002/esp.3884'], '/home/sgrieve/StanfordNLP/code/',
         '/home/sgrieve/StanfordNLP/code/')