从一亿个数中找出最大的一万个数

问题定义：

从一亿个数中找出最大的一万个数

不假思索：

拿到这道题，马上就会想到的方法是建立一个数组把1亿个数装起来，然后用for循环遍历这个数组，找出最大的1万个数来。原因很简单，如果要找最大的那个数，就是这样解决的；而找最大的一万个数，只是重复一万遍。这个解法类似于选择排序，一次将一个正确解放在合适的位置，重复一万次，所以时间复杂度为O(n *m)，如果你愿意去实现一下，会发现不等个几十分钟是不会出结果的。

稍做思考：

上面的解决方案类似于一个选择排序，而我们知道，所有排序算法中选择排序是比较慢的，所以我们选择快速排序，将整个数组都排好续，然后取前一万个数就是我们想要的结果，solution1就是采用这种方法，将时间减少到只要几秒，可见快速排序真的很快。

深入思考：

上面的方法已经有一个不错的效率了，但是，这里我们做了很多无用功，题目只要求我们将前一万个最大的数找到，我们却排序了整个数组，稍微相一下就知道还有很大的优化空间。方法二是解设前一万个数是我们需要找的一万个数，但是假设不一定成立，那么，我们只需要讲后续元素与前一万个数中的最小元素比较，如果最小元素比后续元素小，则交换数据，这样只需要遍历一遍大数组就能得到正确解。时间复杂度为O(n).solution2就是采用这种方法

深思熟虑：

solution3的基本思路和solution2是一样的，不过做了两点优化。在solution2中，我们需要不断的找出前一万个数中最小的元素，是否可以进行优化呢？答案是肯定的，优化的方法就是维持前一万个数基本有序，则每次只需要查找一个很小的范围，就能找到前一万个数中最小的元素。还有点优化就是，前一万个数无序的时候，查找时间就变长了，就退化成solution2了，所以再交换600次数据以后，再对前一万个数进行排序，这样能提高查找最小元素的时间。

不断否定自己的方法：

solution4测试了使用mutilset来保存前一万个数的情况，我们知道mutilset采用的是红黑树，且容器中的元素是有序的，正好符合我们的需求，实际测试发现，先率没有solution3快。

[cpp]view plaincopy
#include <stdio.h>  
#include <stdlib.h>  
#include <string.h>  
#include <set>  
#include <algorithm>  
#include <iostream>  
#include <sys/times.h>  
#include <unistd.h>  
#define DEBUG  
using namespace std;  
  
multiset<int> s;  
const int BIG_ARR_SIZE = 100000000;  
const int RESULT_ARR_SIZE = 10000;  
bool compre( int a, int b)  
{  
        return a > b;  
}  
  
void solution1( int *buff)  
{  
        sort( buff, buff + BIG_ARR_SIZE, compre);  
}  
  
void swap( int *buff, int i, int j)  
{  
        int temp = buff[i];  
        buff[i] = buff[j];  
        buff[j] = temp;  
}  
void solution2( int *buff)  
{  
        bool bExchanged = true;  
        int i, idx, j;  
  
        for( i = RESULT_ARR_SIZE; i < BIG_ARR_SIZE; i++)  
        {  
                if( bExchanged )  
                {  
                        //找出前一万个数中最小的  
                        for( idx = 0, j = 1; j < RESULT_ARR_SIZE; j++)  
                        {  
                                if( buff[idx] > buff[j])  
                                        idx = j;  
                        }  
                }  
                //在后续元素比前一万个元素最小元素大，则替换  
                if( buff[i] > buff[idx])  
                {  
                        swap( buff, i, idx);  
                        bExchanged = true;  
                }  
                else  
                {  
                        bExchanged = false;  
                }  
        }  
}  
  
void solution3( int *buff)  
{  
        sort(buff, buff+RESULT_ARR_SIZE, compre);  
        int minElemIdx = RESULT_ARR_SIZE -1;  
        int zoneBeginIdx = minElemIdx;  
        for( int i = RESULT_ARR_SIZE; i < BIG_ARR_SIZE; i++)  
        {  
                if( buff[i] > buff[minElemIdx] )  
                {  
                        swap(buff, i, minElemIdx);  
                        if( minElemIdx == zoneBeginIdx )  
                        {  
                                zoneBeginIdx--;  
                                if( zoneBeginIdx < RESULT_ARR_SIZE - 600 )  
                                {  
                                        sort( buff, buff + RESULT_ARR_SIZE, compre);  
                                        zoneBeginIdx = minElemIdx = RESULT_ARR_SIZE - 1;  
                                }  
                                continue;  
                        }  
  
                        int idx = zoneBeginIdx;  
                        int j = idx + 1;  
                        //查找最小元素      
                        for(; j < RESULT_ARR_SIZE; j++)  
                        {  
                                if( buff[idx] > buff[j])  
                                        idx = j;  
                        }  
                        minElemIdx = idx;  
                }  
        }  
}  
  
  
void solution4(int *buff)  
{  
        int i;  
        for( i = 0; i < RESULT_ARR_SIZE; i++)  
        {  
                s.insert( buff[i]);  
        }  
  
        for(i = RESULT_ARR_SIZE; i < BIG_ARR_SIZE; i++)  
        {  
                if( buff[i] > *s.begin() )  
                {  
                        s.insert(buff[i]);  
                        s.erase( s.begin());      
                }  
        }  
}  
  
int main(int argc, char* argv[])  
{  
  
        //生成1亿个整数  
        int deno = BIG_ARR_SIZE * 10;  
        int i;  
        int *buff;  
        int *backup;  
        struct tms begTime, endTime;  
        int result[RESULT_ARR_SIZE];  
        long beg,end;  
        buff = new int[BIG_ARR_SIZE];  
        backup = new int[BIG_ARR_SIZE];  
  
        int clocks_per_sec = sysconf( _SC_CLK_TCK);  
        srand(time(0));  
        for( i = 0; i < BIG_ARR_SIZE; i++ )  
        {  
                buff[i] = rand() % deno;  
        }  
  
        //将原始数据备份，其他解决方案使用同样的数据，才有可比性  
        memcpy(backup, buff, sizeof(int) * BIG_ARR_SIZE);  
  
#ifdef DEBUG  
        beg = times(&begTime);   
        solution1(buff);  
        end = times(&endTime);   
        cout << "1.Use sort algorithm in STL: " << (end - beg) * 1000 / clocks_per_sec << " ms" << endl;  
        for( i = 0; i <9; i++)  
        {  
                cout<< buff[i] << "\t";  
        }  
        cout << endl;  
#endif  
  
        memcpy( buff, backup, sizeof(int) * BIG_ARR_SIZE);  
        beg = times(&begTime);   
        solution2(buff);  
        end = times(&endTime);   
        cout << "2.Assume the first 10000 numbers is larger then the later number: " << (end - beg) * 1000 / clocks_per_sec  << " ms" << endl;  
#ifdef DEBUG  
        sort( buff, buff + RESULT_ARR_SIZE, compre);  
        for( i = 0; i <9; i++)  
        {  
                cout<< buff[i] << "\t";  
        }  
        cout << endl;  
  
#endif  
        memcpy( buff, backup, sizeof(int) * BIG_ARR_SIZE);  
        beg = times(&begTime);   
        solution3(buff);  
        end = times(&endTime);   
        cout << "3.Assume the first 10000 numbers is larger then the later number(with optimization): " << (end - beg) * 1000 / clocks_per_sec  << " ms" << endl;  
  
#ifdef DEBUG  
        sort( buff, buff + RESULT_ARR_SIZE, compre);  
        for( i = 0; i <9; i++)  
        {  
                cout<< buff[i] << "\t";  
        }  
        cout << endl;  
#endif  
        memcpy( buff, backup, sizeof(int) * BIG_ARR_SIZE);  
        beg = times(&begTime);   
        solution4(buff);  
        end = times(&endTime);   
        cout << "4.Assume the first 10000 numbers is larger then the later number(with multiset): " << (end - beg) * 1000 / clocks_per_sec  << " ms" << endl;  
  
        return 0;  
}  

测试结果：

从一亿个数中找出最大的一万个数

结论：

经过层层优化，这个难题只需要半秒时间就解决了，可见算法好坏对于程序效率的影响是巨大的。

参考资料：

赖永浩 . 算法优化 . 某某杂志期刊.51--54.

从一亿个数中找出最大的一万个数

相关推荐