Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

实现了关键词提取的textrank算法,请yanyi check一下。另外,idf的方法和textrank哪个更优? #41

Open
CasyWang opened this issue Jul 9, 2015 · 3 comments

Comments

@CasyWang
Copy link

CasyWang commented Jul 9, 2015

/*
 * TextRank.hpp
 *
 *  Created on: 2015年7月7日
 *      Author: oliverlwang
 */
#ifndef TEXTRANK_H_
#define TEXTRANK_H_

#include "UndirectWeightedGraph.hpp"

namespace CppJieba
{

class TextRank
{
public:
    TextRank() : _span(2) {};
    virtual ~TextRank(){};

    /**
     * @brief get the TopN Keywords.
     *
     * @param vector words input words
     * @param vector keywords keywords with score
     * @param int topN how many keywords do you want
     *
     * @retval
     */
    int textRank(vector<string>& words, map<string, double>& wordmap)
    {
        try
        {
            UndirectWeightedGraph graph;
            map< pair<string, string>, double> cm;

            for(size_t i = 0; i < words.size(); ++i)
            {
                /* syntactic filter */

                /* ngram, when span=2, f-measure gets best result */
                for(size_t j = i + 1; j < i + _span; ++j)
                {
                    if(j >= words.size())
                        break;
                    /* using std::pair as union key */
                    pair<string, string> key = make_pair(words[i], words[j]);
                    cm[key] += 1.0;
                }
            }

            /* add edge */
            for(map< pair<string, string>, double>::iterator it = cm.begin(); it != cm.end(); ++it)
            {
                /* do not add edge between the same vertex */
                if(it->first.first == it->first.second)
                    continue;

                graph.addEdge(it->first.first, it->first.second, it->second);
            }

            /* rank */
            graph.rank();

            wordmap.clear();
            wordmap = graph.ws;
        }
        catch(exception &e)
        {
            cerr << e.what() << endl;
            return -1;
        }
        return 0;
    }

private:
    int _span;             /* scanning span */


};

} /* namespace CppJieba */
#endif /* TEXTRANK_H_ */


/*
 * UndirectWeightedGraph.hpp
 *
 *  Created on: 2015年7月7日
 *      Author: oliverlwang
 */

#ifndef UNDIRECTWEIGHTEDGRAPH_H_
#define UNDIRECTWEIGHTEDGRAPH_H_

#include <iostream>
#include <algorithm>
#include <vector>
#include <map>
#include <set>

using namespace std;

namespace CppJieba
{

/* edge type */
struct edge_t
{
    string start;
    string end;
    double weight;
};

class UndirectWeightedGraph
{
public:
    UndirectWeightedGraph(): _dampingFactor(0.85){};
    virtual ~UndirectWeightedGraph(){};

    /**
     * @brief add an edge for the UndirectedWeighted Graph
     *
     * @param string &start
     * @param  string &end
     * @param
     *
     * @retval
     */
    void addEdge(const string &start, const string &end, double weight)
    {
        /* add an out edge for vertex start */
        edge_t _edge;
        _edge.start = start;
        _edge.end = end;
        _edge.weight = weight;

        _graph[start].push_back(_edge);

        /* add an out edge for vertex end */
        _edge.start = end;
        _edge.end = start;

        _graph[end].push_back(_edge);
    }

    /**
     * @brief rank the words according to its score
     *
     * @param none
     *
     * @retval none
     */
    void rank()
    {
        map<string, double> outSum;

        /* initialize words score */
        double wsdef = (_graph.size() > 0) ? (1.0 / _graph.size()) : 1.0;

        for(map<string, vector<edge_t> >::iterator it = _graph.begin(); it != _graph.end(); ++it)
        {
            ws[it->first] = wsdef;
            outSum[it->first] = weightOutSum(it->second);
        }

        /* iterator 10 times */
        for(int i = 0; i < 10; ++i)
        {
            /* stl map is sorted by key by default */
            for(map<string, vector<edge_t> >::iterator i = _graph.begin(); i != _graph.end(); ++i)
            {
                double score = 0.0;
                for(vector<edge_t>::iterator j = i->second.begin(); j != i->second.end(); ++j)
                {
                    score += j->weight / outSum[j->end] * ws[j->end];
                }

                ws[i->first] = (1.0 - _dampingFactor) + _dampingFactor * score;
            }
        }

        /* normalize */
        double max_rank = max_element(ws.begin(), ws.end())->second;
        double min_rank = min_element(ws.begin(), ws.end())->second;

        for(map<string, double>::iterator m = ws.begin(); m != ws.end(); ++m)
        {
            m->second = (m->second - min_rank / 10.0) / (max_rank - min_rank / 10.0);
        }
    }

public:
    /* words score */
    map<string, double> ws;

private:
    /* Graph, key: vertex which is a words or term, value: In(vertex) */
      map<string, vector<edge_t> > _graph;

    /* d is the damping factor that can be set between 0 and 1, always set to 0.85 */
    double _dampingFactor;

private:
    /* calculate the weight sum of out edge */
    double weightOutSum(const vector<edge_t>& v)
    {
        double sum = 0.0;
        for(vector<edge_t>::const_iterator it = v.begin(); it != v.end(); ++it)
        {
            sum += it->weight;
        }
        return sum;
    }

};

}/* namespace CppJieba */

#endif /* UNDIRECTWEIGHTEDGRAPH_H_ */

@yanyiwu
Copy link
Owner

yanyiwu commented Jul 9, 2015

非常感谢!
不过如果能作为一次pull request提交是否更好一些?

@w32zhong
Copy link
Contributor

w32zhong commented May 5, 2016

TextRank的计算复杂度那么高,想必提出来一定是有效果上的优势的。

Copy link

github-actions bot commented Sep 8, 2024

This issue has not been updated for over 5 years and will be marked as stale. If the issue still exists, please comment or update the issue, otherwise it will be closed after 7 days.

@github-actions github-actions bot added the Stale label Sep 8, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

3 participants