解决pdfbox中文乱码问题
网上搜到的中文乱码解决办法都不管用，花了几天时间，终于自己解决了。
## org.apache.pdfbox.pdmodel.font.PDFont.java
/**
 * Decodes the character code stored in {@code c[offset .. offset + length - 1]} into a string.
 *
 * <p>The lookups are tried in this order, stopping at the first non-null result:</p>
 * <ol>
 *   <li>this font's own {@code cmap}, for 1-byte or 2-byte codes the CMap reports mappings for;</li>
 *   <li>the font's {@code Encoding}, if {@link #getEncoding()} returns one;</li>
 *   <li>for 2-byte codes on a {@code PDType0Font}: the
 *       {@code <Registry>-<Ordering>-UCS2} CMap named after the first descendant font's CMap;</li>
 *   <li>{@code getStringFromArray}, but only when this font has no {@code cmap}
 *       or the code is 2 bytes long.</li>
 * </ol>
 *
 * @param c the byte array holding the character code
 * @param offset index of the first byte of the code within {@code c}
 * @param length number of bytes that make up the code
 * @return the decoded string, or {@code null} when none of the lookups above produced a value
 * @throws IOException if creating the descendant font or closing the CMap resource stream fails
 */
public String encode( byte[] c, int offset, int length ) throws IOException
{
    String retval = null;
    if( cmap != null )
    {
        // Only consult the CMap for code widths it actually declares mappings for.
        if( (length == 1 && cmap.hasOneByteMappings())
                || (length == 2 && cmap.hasTwoByteMappings()) )
        {
            retval = cmap.lookup( c, offset, length );
        }
    }
    // there is no cmap but probably an encoding with a suitable mapping
    if( retval == null )
    {
        Encoding encoding = getEncoding();
        if( encoding != null )
        {
            retval = encoding.getCharacter( getCodeFromArray( c, offset, length ) );
        }
        // when the font has a DescendantFonts entry, try to decode with it
        if( length == 2 && retval == null && this instanceof PDType0Font )
        {
            retval = lookupInDescendantUcs2CMap( c, offset, length );
        }
        // Last resort: build the string directly from the raw bytes.
        if( retval == null && (cmap == null || length == 2) )
        {
            retval = getStringFromArray( c, offset, length );
        }
    }
    return retval;
}

/**
 * Looks the code up in the {@code <Registry>-<Ordering>-UCS2} CMap that corresponds to the
 * CMap of the first entry in this font's DescendantFonts array.
 *
 * @param c the byte array holding the character code
 * @param offset index of the first byte of the code within {@code c}
 * @param length number of bytes that make up the code
 * @return the mapped string, or {@code null} if there is no usable descendant font,
 *         the descendant font has no CMap, or the UCS2 CMap is unavailable
 * @throws IOException if creating the descendant font or closing the resource stream fails
 */
private String lookupInDescendantUcs2CMap( byte[] c, int offset, int length ) throws IOException
{
    // Check the types before casting so a malformed dictionary falls through to the
    // remaining fallbacks instead of raising a ClassCastException.
    Object descendants = font.getDictionaryObject( COSName.DESCENDANT_FONTS );
    if( !(descendants instanceof COSArray) )
    {
        return null;
    }
    COSArray descendantFontArray = (COSArray)descendants;
    if( descendantFontArray.size() == 0 )
    {
        return null;
    }
    Object firstDescendant = descendantFontArray.getObject( 0 );
    if( !(firstDescendant instanceof COSDictionary) )
    {
        return null;
    }

    PDFont descendentFont = PDFontFactory.createFont( (COSDictionary)firstDescendant );
    // Without a CMap on the descendant font there is no registry/ordering to build the name from.
    if( descendentFont == null || descendentFont.cmap == null )
    {
        return null;
    }
    CMap descendantCMap = descendentFont.cmap;
    String cmpName = descendantCMap.getRegistry() + "-" + descendantCMap.getOrdering() + "-UCS2";

    CMap ucs2CMap = cmapObjects.get( cmpName );
    // NOTE(review): threadLocale is set after the first parse attempt regardless of cmpName,
    // so a later request for a different CMap name on the same thread is never parsed.
    // Presumably this is a "parse only once" guard - confirm that is intended.
    if( ucs2CMap == null && threadLocale.get() == null )
    {
        // NOTE(review): a name without a leading '/' is resolved relative to this class's package.
        java.io.InputStream cmapStream = getClass().getResourceAsStream( cmpName );
        try
        {
            parseCmap( resourceRootCMAP, cmapStream, COSName.getPDFName( cmpName ) );
        }
        finally
        {
            // Closing an already-closed stream has no effect, so this is safe
            // whether or not parseCmap closes it.
            if( cmapStream != null )
            {
                cmapStream.close();
            }
        }
        threadLocale.set( true );
        ucs2CMap = cmapObjects.get( cmpName );
        if( ucs2CMap == null )
        {
            log.warn( "Can't parse the CMap for " + cmpName );
        }
    }
    return ucs2CMap != null ? ucs2CMap.lookup( c, offset, length ) : null;
}