但我认为一定有比这更好的办法，所以我转而使用正则表达式，更具体地说，是 Python 的 re 模块。这个新脚本的相关部分如下所示：
1 2 3 4 5
# Collect everything between `src="` and `/>` in the page text.
# NOTE: `.*` is greedy, so a match runs on to the last `/>` it can reach on
# the line and captures trailing attributes/markup, not just the image path.
match = re.findall(r'src="(.*)/>', all_text)
imagelist.extend(match)  # no-op when nothing matched
它的一小部分输出如下所示:
1 2
images/cmcanvas.png" title="Context Menu for the document canvas" alt="Context Menu for the document canvas" /></td></tr></table><br images/eps-imp1.png" title="EPS preview in a file dialog" alt="EPS preview in a file dialog" images/eps-imp5.png" title="Colors imported from an EPSfile" alt="Colors imported from an EPSfile" images/eps-imp4.png" title="EPS font substitution" alt="EPS font substitution" images/eps-imp2.png" title="EPS import progress" alt="EPS import progress" images/eps-imp3.png" title="Bitmap conversion failure" alt="Bitmap conversion failure"
# Walk the text one character at a time; every '=' that is directly preceded
# by "src" marks an image attribute, whose position is handed to imagefound().
index = 3  # start where three preceding characters are guaranteed to exist
while index < linelength:
    if all_text[index] == '=' and all_text[index - 3:index] == 'src':
        imagefound(all_text, imagelist, index)
    index += 1
def imagefound(all_text, imagelist, index):
    """Append the double-quoted value following a ``src=`` to *imagelist*.

    Args:
        all_text: The text being scanned.
        imagelist: List that receives the extracted value; mutated in place.
        index: Position of the ``=`` character in ``all_text``. The value is
            taken to start two characters later, i.e. just past the opening
            ``"``, and to end at the next ``"``.

    Returns:
        None. The extracted value is appended with a trailing newline.

    If there is no closing quote after the start position, nothing is
    appended (the earlier character-by-character scan ran past the end of
    the text and raised IndexError in that case).
    """
    start = index + 2  # skip the '=' and the opening double quote
    end = all_text.find('"', start)
    if end == -1:
        # Unterminated attribute value: there is nothing reliable to record.
        return
    imagelist.append(all_text[start:end] + '\n')
# Directory holding the HTML documentation pages to scan.
DOC_DIR = '/home/gregp/development/Scribus15x/doc/en/'

imagelist = []

# Only the .html files in the directory are of interest.
filenames = os.listdir(DOC_DIR)
htmlnames = [name for name in filenames if name.endswith('.html')]

for htmlfile in htmlnames:
    # Read the whole page; the context manager closes the file promptly.
    with open(DOC_DIR + htmlfile) as infile:
        all_text = infile.read()
    linelength = len(all_text)

    # Every '=' directly preceded by "src" marks an image attribute; its
    # position is handed to imagefound(), which appends to imagelist.
    index = 3  # start where three preceding characters are guaranteed to exist
    while index < linelength:
        if all_text[index] == '=' and all_text[index - 3:index] == 'src':
            imagefound(all_text, imagelist, index)
        index += 1

# imagefound() stores each entry with a trailing newline, so writelines()
# produces one entry per line.
with open('/tmp/imagelist_parse4.txt', 'w') as outfile:
    outfile.writelines(imagelist)

imageno = len(imagelist)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(str(imageno) + " images were found and saved")