排序算法（二）计数与插入

计数排序

输入增强：对问题的部分或全部输入做预处理，然后将获得的额外信息进行存储，以加速后面问题的求解。（以空间换时间）。

计数排序：针对待排序列表中的每一个元素，算出列表中小于该元素的元素个数，并将结果记录在一张表中。这个“个数”指出了该元素在有序表中的位置。当几个元素相同时，需要做一定的修改。

算法描述：计数排序假设n个输入元素的每一个都是在0到k区间内的一个整数。当k=O(n)时，排序的运行时间为 $\Theta(n)$ 。假设输入是一个数组A[1,...,n]，需要两个额外的数组：B[1,...,n]存放排序的输出，C[0,...,k]提供临时存储空间。

这个其实应该叫做分布计数排序。

伪代码：

如果n个元素都是互异的，对于每一个A[j]值来说，C[A[j]]-1就是A[j]在输出数组中最终正确位置。因为所有元素可能并不是互异的，每将一个值放入数组output中以后，将C[A[j]]的值减1。这样，当遇到下一个值等于A[j]的输入元素时，该元素可以直接放到输出数组output中A[j]的前一个位置。

算法复杂度分析：第一次for循环所花时间为 $\Theta(k)$ ，第二次循环所花时间为 $\Theta(n)$ ，第三次循环所花时间为 $\Theta(k)$ ，第四次循环所花时间为 $\Theta(n)$ 。总的时间代价为 $\Theta(n+k)$ 。在实际工作中，当 $k=O(n)$ 时，运行时间为 $\Theta(n)$ 。

#include <stdio.h>

#include <string.h>

#include <cstring>

#define RANGE 255

int strl(char *s){

int i,k=0;

for(i=0;s[i];i++)k++; //finde s[i]=’\0’

return k;

}

// The main function that sort the given string str in alphabatical order

void countSort(char *str,char *output){

// The output character array that will have sorted str

// Create a count array to store count of inidividul characters and

// initialize count array as 0

int count[RANGE + 1], i;

memset(count, 0, sizeof(count));

// Store count of each character

for(i = 0; str[i]; ++i)

++count[str[i]];

// Change count[i]so that count[i] now contains actual position of this character in output array

for (i = 1; i <= RANGE; ++i)

count[i] += count[i-1];

// Build the output character array

for (i = 0; str[i]; ++i){

output[count[str[i]]-1] = str[i];

--count[str[i]];

}

// Copy the output array to str, so that str now contains sorted characters

for (i = 0; str[i]; ++i)

str[i] = output[i];

}

int _tmain(int argc, _TCHAR* argv[]){

char str[] = "geeksforgeeks";//"applepp";

int n=strl(str);

char *output=new char(n);

countSort(str,output);

printf("Sorted string is %s\n", str);

return 0;

}

给定一个大文件，内含5亿个整数，每个整数都属于1-9999999之间。请设计方案，对这些元素进行排序，并将排序结果写成文件输出。可用内存不超过2G。

注意到元素一共有5亿个，因此，文件中肯定存在重复元素。可以考虑使用CountingSort（计数排序），统计每个元素出现的次数。

#define MAX 9999999

int Count[MAX + 1] = { 0 };

int _tmain(int argc, _TCHAR* argv[]){

char filename_in[] = "D:\\Database\\500m_merged\\500m_merged.txt";

clock_t start,finish;

double totaltime;

start=clock();

FILE *fp_in = fopen(filename_in, "r");

char content[10];

while (!feof(fp_in)){

fgets(content,10, fp_in);

int element = atoi(content);

++Count[element]; //counting...

}

fclose(fp_in);

char filename_out[] = "D:\\Database\\500m_merged\\out.txt";

FILE *fp_out = fopen(filename_out, "w");

for (int i = 1; i <= MAX;i++){

for (int j = 1; j <= Count[i]; j++){

fprintf(fp_out, "%d\n", i);

}

fclose(fp_out);

finish=clock();

totaltime=(double)(finish-start)/CLOCKS_PER_SEC;

cout<<"\n 运行时间"<<totaltime<<"s"<<endl;

return 0;

}

插入排序（升序）

思想：对于数组A[0,...,n-1]排序，假设对较小数组A[0,...,n-2]已经排好序，显然我们要做的就是在这些有序的元素中为A[n-1]找到一个合适的位置，然后把它插入。一般来说，即从右到左扫描这个这个有序的子数组，直到遇到第一个小于等于A[n-1]的元素，然后把它插在这个元素的后面。这里我们采用自底向上的迭代算法。

伪代码：

说明：首先排序的是A[0,1]这个子数组，如(d)图所示就很好的表示了A[j+1]<--A[j],j<--j-1这个过程。如果A[j]<=v，即找到了那个插入的位置，则在这个元素后面这个位置插入v值。值得注意的是上述伪代码中其实有一个很大的BUG，当j=-1时，while会做A[j]>v这个判断，这时候就会发生异常。所以最好还是用for循环。

在每次for循环的迭代开始时，包含元素A[0,...,j-1]的子数组构成了当前排好序的左手中的牌，剩余的子数组A[j+1,n-1]对应于仍在桌子上的牌。

void InterSectSort(Element **Array,int N,ComFunc Compare){

int i;

for(i=1;i<N;i++){

Element*temp;

temp=Array[i];

int j;

//注意到 j>=0,这个条件满足那么j会自减1，然后跳出for循环

for(j=i-1;j>=0;j--){

if(Compare(Array[j],temp)>0){

Array[j+1]=Array[j];

}

else //Found the Position

break;

}

Array[j+1]=temp;

}

算法导论中的示意图（下标从1开始）如下：

算法复杂度分析：算法中的基本操作就是键值比较A[j]>v。在最坏情况下，A[j]>v的执行次数达到最大，即对于每个j=i-1,...,0,都要执行一次，这种情况在对于每一个j都有A[j]>v=A[i]时发生。注意，在最坏的情况下，有A[0]>A[1](当i=1时),A[1]>A[2],...,A[n-2]>A[n-1](当i=n-1)。即最坏输入是一个严格递减的数组，对于这种输入的键值比较次数为

值得一提的是，在最坏情况下，插入排序和选择排序的键值比较次数是完全一致的。

最好的情况下，即在外部循环的每次迭代中，A[j]>v只执行一次。当且仅当对于每一个i=1,...,n-1,都有A[i-1]<=A[i]。即输入数组已经按照升序排列。对于有序的数组，键值比较次数为

int _tmain(int argc, _TCHAR* argv[]){

Element **Array;

int Items=2000;

//从文件中读取内容

char *filename="..\\IntersectSort\\test.txt";

Items=LoadArray(filename,Items,&Array);

InterSectSort(Array,Items,(ComFunc)Cfunc);

ShowArray(Array,Items,(ComFunc)Cfunc);

return 0;

}

http://www.cnblogs.com/microsoftmvp/p/4592461.html

从FILE*对象中Load数据

//sorthdr.h

#ifndef _SORTHDR_H

#define _SORTHDR_H

typedef struct sElement{

char *text;

}Element;

typedef int(*ComFunc)(void*,void*);

#endif

//sortsub.c

#define MAX_ITEM_SIZE 500

//我也是醉了直接三级指针

int LoadArray(char *Filename,int MaxItems,Element ***Array){

FILE* infile;

//注意，这是字符串的最大长度

char buffer[MAX_ITEM_SIZE],*s;

int i;

if((infile=fopen(Filename,"r"))==NULL){

fprintf(stderr,"Can't open file%s\n",Filename);

return -1;

}

//分配为一个二级指针，这个二级指针存储的就是一个file中每一行的字符串

*Array=(Element**)malloc(sizeof(Element**)*MaxItems);

if(*Array==NULL){

fprintf(stderr,"Can't allocate array of pointers%s\n");

return -1;

}

i=0;

while(fgets(buffer,MAX_ITEM_SIZE,infile)){

if(i>=MaxItems){

printf("Entire data file nor loaded\n");

break;//判断内存限制

}

s=buffer+strlen(buffer);

while(iscntrl(*s))

*s--=0;

(*Array)[i]=(Element*)malloc(sizeof(Element));

if((*Array)[i]==NULL){

fprintf(stderr,"Can't get memory for data\n");

return -1;

}

(*Array)[i]->text=(char*)malloc(strlen(buffer)+1);

if((*Array)[i]->text==NULL){

fprintf(stderr,"Can't get memory for data\n");

return -1;

}

strcpy((*Array)[i++]->text,buffer);

}

//特殊情况考虑，如果数组只有一个item，且为一个空字符串，返回空数组

if(i==1&&*((*Array)[0]->text)==0)

i=0;

fclose(infile);

return i;

}

void ShowArray(Element **Array,int Items,ComFunc Compare){

int i=0,sorted=1,column=1;

for(i=0;i<Items;i++){

if(column>61){

printf("\n");

column=1;

}

else while((column-1)%20){

printf(" ");

column+=1;

}

printf("%3d: %s",i,Array[i]->text);

column+=5+strlen(Array[i]->text);

if(i>0){

if(Compare(Array[i-1],Array[i])>0)

sorted=0;

}

if(sorted)

printf("\n\nThe array is sorted.\n");

else

printf("\n\nThe array is not sorted.\n");

}

希尔排序

希尔排序是插入排序的一个变体。插入排序之所以缓慢的原因是：它一次只把一个项目称过一组有序的记录。就是说，如果记录离他们的最终位置很远，则只有在经过多次交换后才能正确地定位它们。解决：在排序的早期阶段允许记录跳转较大的距离。

基本思想：把记录分成几个交替的组，然后使用插入排序算法对每个组进行排序。下面的例子非常清晰的阐述了这个基本思想。

为什么说这是对普通的插入排序的改进呢？我们注意观察A的移动。当它从完全错误的位置开始移动时，在前两次移动的每一次移动中，它将跳过相距其位置一般的距离，只需要三次交换即可到达其目标位置。

这种每隔h个记录选择一个记录以便反复再分为交替记录集的方式称为希尔排序。选择合适的h值序列需要做大量的工作。编程时采用以下模式来确定h的值序列：

设 $h_{1}=1$ ，n=待排序的记录数

设 $h_{s+1}=3h_{s}+1$ ，当 $h>n/9$ 时停止。

讨论：虽然希尔排序不具有像插入排序或冒泡排序处理已经有序的记录时那样引人注目的速度，但是在处理逆序或随机顺序的记录时也不会遭受显著的速度降级。实际上，希尔排序的运行时间对输入数据相当不敏感。希尔排序的唯一负面特性就是：它是一种不稳定的排序。希尔排序成为几乎任何情况下的一个优秀的选择。

void ShellSort(int Array[],int N){

int step,h;

for(h=1;h<=N/9;h=3*h+1)

;

for(;h>0;h/=3){

for(step=h;step<N;step++){

int j;

int temp=0;

temp=Array[step];

for(j=step-h;j>=0;j-=h){

if(Array[j]>temp){

Array[j+h]=Array[j];

}

else //Found the Position

break;

}

Array[j+h]=temp;

}

排序算法（二）计数与插入

计数排序

插入排序（升序）

相关问题：

对链表插入排序

希尔排序