如何从一个.txt中寻找目标字符串?

萧乐颜 2008-02-14 03:58:27
我有个URL.txt,保存的是已经得到的html源代码,在不用正则表达式的情况下,如何获取里面的URL以及它所对应的名字,然后保存在另外一个.txt 里面?环境是VS2005下

例如得到的源代码:
<html>
...
<body>
...
<a href="http://ss.mop.com/pub/show.php?tpl_id=202&id=296" target="_blank" class="a47">仙魔大战CG</a><br>
<a href="http://ss.mop.com/pub/show.php?tpl_id=202&id=296" target="_blank" class="a47">仙侠异界故事</a><br>
<a href="http://ss.mop.com/pub/show.php?tpl_id=202&id=296" target="_blank" class="a47">五大门派演示</a><br>
<a href="http://ss.mop.com/pub/show.php?tpl_id=202&id=412" target="_blank" class="a47">仙侠异界(序章)</a>
.....
</html>
例如,如何获取 URL(http://ss.mop.com/pub/show.php?tpl_id=202&id=296)及其对应的名字(仙侠异界故事)等?
...全文
298 12 打赏 收藏 转发到动态 举报
AI 作业
写回复
用AI写文章
12 条回复
切换为时间正序
请发表友善的回复…
发表回复
萧乐颜 2008-02-15
  • 打赏
  • 举报
回复
我问的是我的代码哈。
呵呵,我还是新手,得多向大家学习学习!
萧乐颜 2008-02-15
  • 打赏
  • 举报
回复
呵呵,这只是整个项目中的一小部分,所以没用你说的那种方式,用另外一种方式实现了!
结帖了!散分!
mLee79 2008-02-15
  • 打赏
  • 举报
回复
太失败了, fprintf( stderr , "warning unexpected CHAR:%x(%c)\n" , *p , *p );
mLee79 2008-02-15
  • 打赏
  • 举报
回复
搞错:

else if( NULL == href )
{
fprintf( stderr , "warning unexpected CHAR:%x(%c)\n" , *href );
p = text;
}


应该是:

else if( NULL == href )
{
fprintf( stderr , "warning unexpected CHAR:%x(%c)\n" , *p );
href = text;
}


mLee79 2008-02-15
  • 打赏
  • 举报
回复
运行效果:

$ cat inp; gcc -o gi gi.c; ./gi inp
<html>
<body>

<a href="http://ss.mop.com/pub/show.php?tpl_id=202&id=296" target="_blank" class="a47"> 仙魔大战CG </a> <br>
<a href="http://ss.mop.com/pub/show.php?tpl_id=202&id=296" target="_blank" class="a47"> 仙侠异界故事 </a> <br>

<a href="http://ss.mop.com/pub/show.php?tpl_id=202&id=296" target="_blank" class="a47"> 五大门派演示 </a> <br>

<a href="http://ss.mop.com/pub/show.php?tpl_id=202&id=412" target="_blank" class="a47"> 仙侠异界(序章) </a>
</body>
</html>


match HREF:"http://ss.mop.com/pub/show.php?tpl_id=202&id=296" , title:" 仙魔大战CG "
match HREF:"http://ss.mop.com/pub/show.php?tpl_id=202&id=296" , title:" 仙侠异界故事 "
match HREF:"http://ss.mop.com/pub/show.php?tpl_id=202&id=296" , title:" 五大门派演示 "
match HREF:"http://ss.mop.com/pub/show.php?tpl_id=202&id=412" , title:" 仙侠异界(序章) "
mLee79 2008-02-15
  • 打赏
  • 举报
回复
为虾米不用正则表达式, 酱紫用正则表达式可以吧:


#include <stdio.h>
#include <ctype.h>

/*
 * Scans the writable buffer [begin, end) with a table-driven DFA and prints
 * one `match HREF:"..." , title:"..."` line to stdout for every anchor found.
 *
 * Tokens recognised (as traced from the tables below; letters are matched
 * case-insensitively because yy_ec maps both cases to the same class):
 *   action 1: "<a" followed by one whitespace character -> start state 3
 *   action 2: ">" while inside the tag   -> title starts after it, start state 5
 *   action 3: "href", optional whitespace, "="  -> pick up the attribute value
 *   action 4: "</a>" while reading the title    -> report the match, start state 1
 *   action 5: any other single character        -> ignored
 *
 * The buffer is modified in place: the quote that closes the href value and
 * the '<' of "</a>" are overwritten with 0 so that href/title can be printed
 * as C strings. Scanning stops as soon as `begin` reaches `end`.
 *
 * begin: first byte of the text to scan (must be writable).
 * end:   one past the last byte to scan.
 */
void slove( unsigned char* begin , unsigned char* end )
{
    enum { YY_LASTDFA=22, YY_JAMBASE=26 };
    /* Rule number accepted in each DFA state (0 = not accepting). */
    static const unsigned char yy_accept[] = {
        0, 0, 0, 0, 0, 0, 0, 6, 5, 5,
        2, 5, 5, 0, 0, 0, 1, 0, 0, 0,
        4, 0, 3, 0
    };
    /* Byte value -> equivalence class. */
    static const unsigned char yy_ec[] = {
        0, 1, 1, 1, 1, 1, 1, 1, 1, 2,
        2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 3, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        4, 5, 6, 1, 1, 7, 1, 1, 1, 8,
        9, 1, 10, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 11, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 7, 1, 1,
        1, 8, 9, 1, 10, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 11, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1
    };
    static const unsigned char yy_meta[] = {
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1
    };
    static const unsigned char yy_base[] = {
        0, 21, 20, 0, 5, 19, 18, 21, 26, 13,
        26, 7, 14, 12, 5, 5, 26, 0, 1, 14,
        26, 0, 26, 26, 0
    };
    static const unsigned char yy_def[] = {
        0, 24, 24, 24, 24, 24, 24, 23, 23, 23,
        23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
        23, 19, 23, 0, 23
    };
    static const unsigned char yy_nxt[] = {
        0, 8, 23, 23, 23, 23, 10, 20, 23, 19,
        11, 10, 18, 17, 16, 11, 21, 15, 14, 22,
        13, 23, 12, 12, 9, 9, 7, 23, 23, 23,
        23, 23, 23, 23, 23, 23, 23, 23
    };
    static const unsigned char yy_chk[] = {
        0, 24, 0, 0, 0, 0, 3, 18, 0, 17,
        3, 4, 15, 14, 13, 4, 19, 12, 11, 19,
        9, 7, 6, 5, 2, 1, 23, 23, 23, 23,
        23, 23, 23, 23, 23, 23, 23, 23
    };

    unsigned char* href = NULL , *title = NULL , *text , *p;
    int curr , start = 1 , stbk = 0 , yyc , action = 0;
    unsigned char* txtbk = NULL ;

    while( 1 )
    {
        /* Match one token starting at `begin`. */
        curr = start;
        text = begin;
        do {
            if( begin >= end )
                return ; /* done */
            yyc = yy_ec[*begin];
            if( yy_accept[curr] )
            {
                /* Remember the last accepting state so a failed longer
                   match can fall back to it. */
                stbk = curr;
                txtbk = begin;
            }
            while( yy_chk[yy_base[curr] + yyc] != curr )
            {
                if( (curr=yy_def[curr]) >= YY_LASTDFA + 2 )
                    yyc = yy_meta[ yyc ];
            }
            curr = yy_nxt[ yy_base[curr] + yyc ];
            ++begin;
        } while( yy_base[curr] != YY_JAMBASE );

        action = yy_accept[curr];
        if( action == 0 )
        {
            /* Jammed in a non-accepting state: back up. */
            begin = txtbk;
            curr = stbk;
            action = yy_accept[curr];
        }

        switch(action)
        {
        case 1:
            start = 3;
            break;
        case 2:
            title = text + 1;
            start = 5;
            break;
        case 3:
            /* Just past "href=": find the value and NUL-terminate it. */
            text = p = begin;
            for( ; p < end ; ++p )
            {
                if( isspace(*p) )
                    ;
                else if( *p == '\'' || *p == '\"' )
                {
                    if( NULL == href )
                    {
                        /* Opening quote: the value starts after it. */
                        href = p + 1;
                    }
                    else
                    {
                        /* Closing quote: terminate the value and resume
                           scanning after it. */
                        *p = 0;
                        begin = p + 1;
                        break;
                    }
                }
                else if( NULL == href )
                {
                    /* The value does not start with a quote. href is still
                       NULL here, so report the offending character through
                       p, and take the value as starting where this
                       attribute scan began. It still ends at the next
                       quote character. */
                    fprintf( stderr , "warning unexpected CHAR:%x(%c)\n" , *p , *p );
                    href = text;
                }
            }
            break;
        case 4:
            /* Overwrite the '<' of "</a>" to terminate the title. */
            *text = 0;
            start = 1;
            printf( "match HREF:\"%s\" , title:\"%s\"\n" , href?(char*)href:"<nil>" , title?(char*)title:"<nil>" );
            href = title = NULL;
            break;
        case 5:
            break;
        }
    }
}

#define MAXX_TEST_LENGTH (32*1024)

/*
 * Runs slove() over every file named on the command line.
 *
 * At most MAXX_TEST_LENGTH bytes of each file are read; the data is followed
 * by two zero bytes before it is scanned. Files that cannot be opened are
 * reported on stderr.
 *
 * Returns the number of files that could not be opened.
 */
int main( int argc , char* argv[] )
{
    unsigned char buff[ MAXX_TEST_LENGTH + 64 ];
    int failures = 0;
    int arg;

    for( arg = 1; arg < argc; ++arg )
    {
        size_t got;
        FILE* in = fopen( argv[arg] , "rb" );

        if( NULL == in )
        {
            fprintf( stderr , "ERROR: open %s failed\n" , argv[arg] );
            ++failures;
            continue;
        }

        got = fread( buff , 1 , MAXX_TEST_LENGTH , in );
        fclose( in );

        /* Terminate the data so slove() can print substrings of it. */
        buff[got + 1] = 0;
        buff[got] = 0;
        slove( buff , buff + got );
    }
    return failures;
}


萧乐颜 2008-02-15
  • 打赏
  • 举报
回复
// load.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include <iostream>
#include <string>
#include <fstream>
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include "vbscript.tlh"

using namespace std;
using namespace VBScript_RegExp_55;

#define _ATL_CSTRING_EXPLICIT_CONSTRUCTORS
#include <atlbase.h>
#include <atlstr.h>
#include <wininet.h>

#pragma comment(lib, "wininet.lib")

#define BUF_SIZE (1000*1024)

void findURL(const char * source)
{
IRegExp2Ptr regex;
regex.CreateInstance(__uuidof(VBScript_RegExp_55::RegExp), 0, CLSCTX_INPROC_SERVER);
regex->Global = -1;
regex->IgnoreCase = -1;
regex->Multiline = -1;
regex->Pattern = "<a\\s+href=\"(http://[^?<>\"=&;]+)\".*>(.[^<>\"=&]+)</a>";
IMatchCollection2Ptr mc = regex->Execute(_bstr_t(source));
for (int i=0; i<mc->Count; ++i)
{
IMatch2Ptr match = mc->Item[i];
ISubMatchesPtr sm = match->SubMatches;
cout << (const char *)_bstr_t(sm->Item[0]) << " " << (const char *) _bstr_t(sm->Item[1]) << endl;
}
}


// CrackedUrl
// Splits a URL with InternetCrackUrl and keeps its scheme, host name, port
// and path. Also offers UrlEncode() for form data.
class CrackedUrl
{
    int m_scheme;
    CStringW m_host;
    int m_port;
    CStringW m_path;
public:
    // Cracks `url`. On failure an error is printed, the constructor sleeps
    // for two seconds, and the object is left with empty host/path strings
    // and whatever scheme/port the zero-initialised URL_COMPONENTS holds.
    CrackedUrl(LPCWSTR url)
    {
        URL_COMPONENTS uc = { 0};
        uc.dwStructSize = sizeof(uc);

        const DWORD BUF_LEN = 256;

        // The buffers are zero-initialised so that a failed
        // InternetCrackUrl leaves valid (empty) strings behind instead of
        // indeterminate stack contents.
        WCHAR host[BUF_LEN] = { 0 };
        uc.lpszHostName = host;
        uc.dwHostNameLength = BUF_LEN;

        WCHAR path[BUF_LEN] = { 0 };
        uc.lpszUrlPath = path;
        uc.dwUrlPathLength = BUF_LEN;

        // NOTE(review): the extra info is requested but never stored, so
        // GetPath() does not include it -- confirm this is intended.
        WCHAR extra[BUF_LEN] = { 0 };
        uc.lpszExtraInfo = extra;
        uc.dwExtraInfoLength = BUF_LEN;

        if (!InternetCrackUrl(url, 0, ICU_ESCAPE, &uc))
        {
            cout<<"Error:InternetCrackUrl failed!";
            Sleep(2000);
        }

        m_scheme = uc.nScheme;
        m_host = host;
        m_port = uc.nPort;
        m_path = path;
    }

    // Scheme value reported by InternetCrackUrl.
    int GetScheme() const
    {
        return m_scheme;
    }

    // Host name part of the URL.
    LPCWSTR GetHostName() const
    {
        return m_host;
    }

    // Port number reported by InternetCrackUrl.
    int GetPort() const
    {
        return m_port;
    }

    // URL path part of the URL.
    LPCWSTR GetPath() const
    {
        return m_path;
    }

    // Returns `p` with letters, digits, '_', '-' and '.' copied unchanged,
    // each space replaced by '+', and every other byte written as "%XX"
    // (two upper-case hex digits). A null `p` yields an empty string.
    static CStringA UrlEncode(const char* p)
    {
        if (p == 0)
        {
            return CStringA();
        }

        CStringA buf;

        for (;;)
        {
            // Read through BYTE so bytes >= 0x80 stay non-negative.
            int ch = (BYTE) (*(p++));
            if (ch == '\0')
            {
                break;
            }

            if (isalnum(ch) || ch == '_' || ch == '-' || ch == '.')
            {
                buf += (char)ch;
            }
            else if (ch == ' ')
            {
                buf += '+';
            }
            else
            {
                char c[16];
                wsprintfA(c, "%%%02X", ch);
                buf += c;
            }
        }

        return buf;
    }
};

// CrackedUrl
// Opens a WinINet session via InternetOpen with the preconfigured access
// type and the given user agent (may be null). Returns the handle produced
// by InternetOpen.
HINTERNET OpenSession(LPCWSTR userAgent = 0)
{
    const DWORD accessType = INTERNET_OPEN_TYPE_PRECONFIG;
    HINTERNET session = InternetOpen(userAgent, accessType, NULL, NULL, 0);
    return session;
}

// Opens an HTTP connection to serverAddr:portNo on the given session via
// InternetConnect, without user name or password. Returns the handle
// produced by InternetConnect.
HINTERNET Connect(HINTERNET hSession, LPCWSTR serverAddr, int portNo)
{
    HINTERNET connection = InternetConnect(
        hSession, serverAddr, portNo,
        NULL, NULL,
        INTERNET_SERVICE_HTTP, 0, 0);
    return connection;
}

// Creates an HTTP request on hConnect via HttpOpenRequest for the given verb
// and object name. INTERNET_FLAG_SECURE is set only when `scheme` is
// INTERNET_SCHEME_HTTPS. Returns the handle produced by HttpOpenRequest.
HINTERNET OpenRequest(HINTERNET hConnect, LPCWSTR verb, LPCWSTR objectName, int scheme)
{
    const bool isHttps = (scheme == INTERNET_SCHEME_HTTPS);
    const DWORD flags = isHttps ? INTERNET_FLAG_SECURE : 0;

    return HttpOpenRequest(hConnect, verb, objectName, NULL, NULL, NULL, flags, 0);
}

// Adds the NUL-terminated `header` text to hRequest via
// HttpAddRequestHeaders with HTTP_ADDREQ_FLAG_ADD. Returns the result of
// HttpAddRequestHeaders.
BOOL AddRequestHeaders(HINTERNET hRequest, LPCWSTR header)
{
    // Length in characters, as measured by lstrlenW.
    const DWORD headerLength = static_cast<DWORD>(lstrlenW(header));
    return HttpAddRequestHeaders(hRequest, header, headerLength, HTTP_ADDREQ_FLAG_ADD);
}

// Sends hRequest via HttpSendRequest with no additional headers and `size`
// bytes of `body` as the optional data. Returns the result of
// HttpSendRequest.
BOOL SendRequest(HINTERNET hRequest, const void* body, DWORD size)
{
    // HttpSendRequest takes a non-const pointer for the optional data.
    void* optionalData = const_cast<void*>(body);
    const BOOL sent = HttpSendRequest(hRequest, 0, 0, optionalData, size);
    return sent;
}

// Placeholder for finishing a request: does nothing with hRequest and always
// returns TRUE.
BOOL EndRequest(HINTERNET hRequest)
{
    // If the request is ever sent with HttpSendRequestEx, HttpEndRequest
    // belongs here.
    (void)hRequest;
    return TRUE;
}

// Forwards to HttpQueryInfo for `queryId` on hRequest, passing szBuf as the
// buffer and pdwSize as the buffer-length pointer; the index argument is 0.
// Returns the result of HttpQueryInfo.
BOOL QueryInfo(HINTERNET hRequest, int queryId, char* szBuf, DWORD* pdwSize)
{
    const BOOL queried = HttpQueryInfo(hRequest, queryId, szBuf, pdwSize, 0);
    return queried;
}

// Forwards to InternetReadFile: reads up to `length` bytes from hRequest
// into `buffer`; the byte count is stored through cbRead. Returns the result
// of InternetReadFile.
BOOL ReadData(HINTERNET hRequest, void* buffer, DWORD length, DWORD* cbRead)
{
    const BOOL readOk = InternetReadFile(hRequest, buffer, length, cbRead);
    return readOk;
}

// Closes hInternet with InternetCloseHandle; a null handle is ignored.
void CloseInternetHandle(HINTERNET hInternet)
{
    if (!hInternet)
    {
        return;
    }
    InternetCloseHandle(hInternet);
}

int _tmain()
{
struct ComInit {
ComInit() {
CoInitialize(0);
}
~ComInit() {
CoUninitialize();
}
} comInit;

HINTERNET hSession = 0;
HINTERNET hConnect = 0;
HINTERNET hRequest = 0;
CStringW strHeader(L"Content-type: application/x-www-form-urlencoded\r\n");

// Test data
//Config cfg;
CrackedUrl crackedUrl(L"http://xunyou.com/");
CStringA strPostData("value1=10&value2=14");

// Open session.
hSession = OpenSession(L"HttpPost by xiaoqiandexiaoqian@126.com");
if (hSession == NULL)
{
cout<<"Error:Open session!";
return -1;
}

// Connect.
hConnect = Connect(hSession, crackedUrl.GetHostName(), crackedUrl.GetPort());
if (hConnect == NULL)
{
cout<<"Error:Connect failed!";
return -1;
}

// Open request.
hRequest = OpenRequest(hConnect, L"POST", crackedUrl.GetPath(), crackedUrl.GetScheme());
if (hRequest == NULL)
{
cout<<"Error:OpenRequest failed!";
return -1;
}

// Add request header.
if (!AddRequestHeaders(hRequest, strHeader))
{
cout<<"Error:AddRequestHeaders failed!";
return -1;
}

// Send post data.
if (!SendRequest(hRequest, (const char*)strPostData, strPostData.GetLength()))
{
cout<<"Error:SendRequest failed!";
Sleep(2000);
return -1;
}

// End request
if (!EndRequest(hRequest))
{
cout<<"Error:EndRequest failed!";
Sleep(2000);
return -1;
}

char szBuf[BUF_SIZE];
DWORD dwSize = 0;
szBuf[0] = 0;


// read data.
for (;;)
{
dwSize = BUF_SIZE;
if (ReadData(hRequest, szBuf, dwSize, &dwSize) == FALSE)
{
break;
}

if (dwSize <= 0)
{
break;
}

szBuf[dwSize] = 0;
cout<<szBuf;

}

vector<char> buf;
ifstream ifs("D:\\URL.txt");
ifs.seekg(0, ios_base::end);
size_t size = ifs.tellg();
ifs.seekg(0, ios_base::beg);
buf.resize(size+1);
ifs.read(&buf[0], buf.size()-1);
buf[size] = 0;
ifs.close();
findURL(&buf[0]);
ofstream resultFile;
resultFile.open("D:\\resultFile.txt");
//////想这这写东西把调试显示的消息保存在resultFile.txt或其他什么文件中
resultFile.close();
ifs.close();


CloseInternetHandle(hRequest);
CloseInternetHandle(hConnect);
CloseInternetHandle(hSession);

return 0;
}

Supper_Jerry 2008-02-15
  • 打赏
  • 举报
回复
针对特定的格式,可以写特定的程序搜索。
大前置 2008-02-15
  • 打赏
  • 举报
回复
下面是我自己写的类,编译通过,不知道能否满足你的要求.
#include <iostream>
#include <fstream>
#include <string>
using namespace std;

// Extracts links from url.txt without regular expressions: for every line
// that contains `<a href="URL" ...>NAME<`, the URL immediately followed by
// the NAME is written as one line to save.txt. Only the first link of a line
// is handled.
class SeUrl
{
public:
    // Processes url.txt line by line and writes the result to save.txt.
    void ReadFile();
    // Writes URL and name of the first link in strSource to the output file.
    void SeparateUrlAndName(const std::string& strSource);
    // Index just past the first occurrence of subStr, or -1 if absent.
    int StrFind(const std::string& strSource, const std::string& subStr);
    // Opens url.txt for reading and save.txt for writing.
    bool OpenFile();
    // Closes both files.
    bool CloseFile();
private:
    std::ifstream fin;
    std::ofstream fout;
    std::string url;   // URL of the most recently extracted link
    std::string name;  // text of the most recently extracted link
};

// Opens "url.txt" (input) and "save.txt" (output).
// Returns false if either file cannot be opened; in that case neither file
// is left open.
bool SeUrl::OpenFile()
{
    fin.open("url.txt");
    if (fin.fail())
        return false;
    fout.open("save.txt");
    if (fout.fail())
    {
        fin.close();
        return false;
    }
    return true;
}

// Closes the input and the output file. Always returns true.
bool SeUrl::CloseFile()
{
    fin.close();
    fout.close();
    return true;
}

// Returns the index of the character that follows the first occurrence of
// subStr in strSource, or -1 when subStr does not occur.
int SeUrl::StrFind(const std::string& strSource, const std::string& subStr)
{
    const std::string::size_type pos = strSource.find(subStr);
    if (pos == std::string::npos)
        return -1;
    return static_cast<int>(pos + subStr.length());
}

// Looks for the first `<a href="` in strSource. If the closing quote of the
// URL and the '>' that ends the tag are both present, writes the URL
// immediately followed by the link text (everything up to the next '<', or
// to the end of the line) and a newline to the output file. Lines without
// such a link produce no output.
void SeUrl::SeparateUrlAndName(const std::string& strSource)
{
    const int urlBegin = StrFind(strSource, "<a href=\"");
    if (urlBegin < 0)
        return;

    // The URL runs up to the closing quote.
    const std::string::size_type urlEnd = strSource.find('"', urlBegin);
    if (urlEnd == std::string::npos)
        return;

    // The link text starts after the '>' that closes the opening tag.
    const std::string::size_type tagEnd = strSource.find('>', urlEnd);
    if (tagEnd == std::string::npos)
        return;

    std::string::size_type nameEnd = strSource.find('<', tagEnd + 1);
    if (nameEnd == std::string::npos)
        nameEnd = strSource.length();

    url = strSource.substr(urlBegin, urlEnd - urlBegin);
    name = strSource.substr(tagEnd + 1, nameEnd - tagEnd - 1);
    fout << url << name << std::endl;
}

// Opens the files, feeds every line of the input to SeparateUrlAndName and
// closes the files again so the output is flushed. Does nothing if the files
// cannot be opened.
void SeUrl::ReadFile()
{
    if (!OpenFile())
        return;

    std::string line;
    // Testing the result of getline (instead of eof() before reading) avoids
    // processing a phantom line after the last one.
    while (std::getline(fin, line))
    {
        SeparateUrlAndName(line);
    }
    CloseFile();
}
// Runs the link extraction once (url.txt -> save.txt), then waits for a
// character on stdin before exiting (presumably to keep the console window
// open -- confirm).
// Returns 0: a non-zero exit status would signal failure to the caller.
int main()
{
    SeUrl u;
    u.ReadFile();
    getchar();
    return 0;
}
baihacker 2008-02-14
  • 打赏
  • 举报
回复
用不用正则式只是把其中IF中的实现变了而已
自己写个函数便是...
baihacker 2008-02-14
  • 打赏
  • 举报
回复
我是用的正则式...

FLAG = 0
WHILE GET_NEXT_LINE to LINE
IF FLAG = 0
IF LINE 满足开始条件
FLAG = 1
END
ELSE
IF LINE 满足结束条件
BREAK
END
IF 匹配
RESULT.INSERT 匹配结果
END
END
END WHILE
visame 2008-02-14
  • 打赏
  • 举报
回复
逐行读入,然后搜索关键词?

65,186

社区成员

发帖
与我相关
我的任务
社区描述
C++ 语言相关问题讨论,技术干货分享,前沿动态等
c++ 技术论坛(原bbs)
社区管理员
  • C++ 语言社区
  • encoderlee
  • paschen
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
  1. 请不要发布与C++技术无关的贴子
  2. 请不要发布与技术无关的招聘、广告的帖子
  3. 请尽可能的描述清楚你的问题,如果涉及到代码请尽可能的格式化一下

试试用AI创作助手写篇文章吧