forked from lcnetdev/lds-processing
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbf2m-bfdb-batch2.py
More file actions
executable file
·121 lines (103 loc) · 3.56 KB
/
bf2m-bfdb-batch2.py
File metadata and controls
executable file
·121 lines (103 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python
####!/marklogic/data/ntra/fmenv/bin/python3
#####!/usr/bin/python fails on from modules
###### !/usr/bin/env python: fails on lxml
######!/marklogic/data/kefo/work/fmenv/bin/python3
# Works on the Atom feed URL of loaded instances (instance IDs in bfdb); also works on a curl of a single instance id, e.g. c0213880820001:
# https://preprod-8230.id.loc.gov/resources/instances/c0213880820001.marc-pkg.xml
#
import glob
import sys
from lxml import etree as ET
from lxml.builder import ElementMaker
import os
import shutil
import multiprocessing
import subprocess
import urllib3
#import urllib2
import argparse
import feedparser
import yaml
from datetime import date, timedelta
from modules.helpers import get_config
from modules.config_parser import args
# Build the configuration through the project helper first.
# NOTE(review): this value is replaced by the raw YAML load further down; the
# call is kept in case get_config() has side effects -- confirm and drop one.
config = get_config(args)

# *** Main Program *****
print("*** BF to MARC testing tool ***")
print("*** Converts the latest feed to MARC ***")

# Previous calendar day, formatted as YYYY-MM-DD.
yesterday = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d')

# Load the YAML config inside a context manager so the file handle is closed
# (the previous bare open() call never closed it).
with open(args.config) as config_file:
    config = yaml.safe_load(config_file)

# Settings for the job selected via args.job.
job = args.job
jobconfig = config[job]
indir = jobconfig["source_directory"]
# NOTE(review): outdir is read from the same "source_directory" key as indir,
# so both names point at the same directory -- confirm this is intended.
outdir = jobconfig["source_directory"]
feed = jobconfig["feed"]

# Echo the effective settings.
print()
print("Config:")
print()
print("Job config:")
print(jobconfig)
print("yesterday is ", yesterday)
print()
print("feed url is ", jobconfig["feed"])
print(indir)
print(outdir)
print('*** results #in out/bf', yesterday, '.xml ***')
# Fetch the Atom feed after substituting yesterday's date into the
# %YESTERDAY% placeholder of the configured feed URL.
http = urllib3.PoolManager()
feedurl = feed.replace('%YESTERDAY%', yesterday)
print("feed url final is ", feedurl)
infile = http.request('GET', feedurl)
# Stop with a clear message on an HTTP error instead of trying to parse an
# error page as the feed.
if infile.status >= 400:
    sys.exit("feed request failed with HTTP status %s: %s" % (infile.status, feedurl))

# Shell command template from the job config; the %BIBID% and %OUTFILE%
# placeholders are filled in per feed entry. Example:
# curl -L 'https://preprod-8230.id.loc.gov/resources/instances/%BIBID%.marc-pkg.xml' > in/%OUTFILE%.rdf
curl = jobconfig["curl"]

# NOTE(review): outfile is built without a path separator while efilename adds
# one, so outdir is presumably expected to end with "/" -- confirm.
outfile = outdir + 'mrc.xml'
efilename = outdir + '/error.txt'

# Parse the response body. ET.XML() needs the XML text itself (bytes/str);
# the response object returned by http.request() is not parseable.
atom = ET.XML(infile.data)

# Compile the BIBFRAME-to-MARC stylesheet once for reuse on every record.
bf2marc = ET.parse("/marklogic/applications/bibframe2marc/bibframe2marc.xsl")
bf2marcxsl = ET.XSLT(bf2marc)
count = 0
# For every Atom <entry>, find <id> children whose text mentions "instances",
# take the fifth "/"-separated segment as the instance id, and run the
# configured curl command with that id substituted for %BIBID% and %OUTFILE%.
# To capture a feed by hand:
# curl -L http://preprod-8230.id.loc.gov/resources/instances/feed/22 > today.xml
for entry in atom.iterfind('.//{http://www.w3.org/2005/Atom}entry'):
    for child in entry:
        if child.tag != "{http://www.w3.org/2005/Atom}id":
            continue
        # An empty <id/> has text None; treat it as no match instead of
        # raising TypeError on the membership test.
        id_text = child.text or ""
        if "instances" not in id_text:
            continue
        segments = id_text.rsplit("/")
        if len(segments) < 5:
            print("skipping feed id with unexpected shape: " + id_text)
            continue
        bibid = segments[4]
        # The id comes from the remote feed and is spliced into a shell
        # command line, so only plain identifier characters are accepted.
        if not bibid or not all(ch.isalnum() or ch in "._-" for ch in bibid):
            print("skipping feed id with unsafe characters: " + id_text)
            continue
        curlcmd = curl.replace('%BIBID%', bibid).replace('%OUTFILE%', bibid)
        # shell=True is kept: the command comes from the job config as a
        # single string and is handed to the shell unchanged.
        returned_value = subprocess.Popen(curlcmd, shell=True).wait()
        if returned_value != 0:
            print("curl exited with status", returned_value, "for", bibid)
# Convert every downloaded *.rdf file in indir to MARCXML with the compiled
# stylesheet and write all records into one marc:collection document.
# Sorted so records are processed and written in a stable order.
bibfiles = sorted(glob.glob(indir + '*.rdf'))
counter = 0

# create output marcxml:collection:
M = ElementMaker(namespace="http://www.loc.gov/MARC21/slim",
                 nsmap={"marc": "http://www.loc.gov/MARC21/slim"})
coll = M.collection()

for counter, file in enumerate(bibfiles, start=1):
    if counter % 100 == 0:
        print(counter, '/', len(bibfiles))
    print("converting to marc: " + file)
    try:
        bfroot = ET.parse(file).getroot()
        # result has marc
        result = bf2marcxsl(bfroot)
        record = ET.XML(bytes(result))
    except Exception:
        # Best-effort batch: report the failure and move on to the next file.
        # Skipping matters: falling through would reuse the previous file's
        # result (or hit an undefined name on the first file).
        print("Unexpected error:", sys.exc_info()[0], sys.exc_info()[1])
        for info in sys.exc_info():
            print(info)
        continue
    coll.append(record)

# Write the collection once, after all records have been gathered; the
# with-block closes the file, so no explicit close is needed.
with open(outfile, 'wb') as out:
    out.write(ET.tostring(coll))
#os.system("cat out/mrc.xml")
#print(glob.glob("out/*xml"))